use crate::analysis::{
classification,
formula::{FormulaAtlas, FormulaGraph},
style,
};
use crate::caps::BackendCaps;
use crate::config::ServerConfig;
use crate::model::{
NamedItemKind, NamedRangeDescriptor, SheetClassification, SheetOverviewResponse, SheetSummary,
WorkbookDescription, WorkbookId, WorkbookListResponse,
};
use crate::tools::filters::WorkbookFilter;
use crate::utils::{
hash_bytes_sha256_hex, hash_file_sha256_hex, hash_path_identity, make_short_workbook_id,
path_to_forward_slashes, system_time_to_rfc3339,
};
use anyhow::{Context, Result, anyhow};
use chrono::{DateTime, Utc};
use parking_lot::RwLock;
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::io::Cursor;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Instant;
use umya_spreadsheet::reader::xlsx;
use umya_spreadsheet::{DefinedName, Spreadsheet, Worksheet};
// --- Key/value layout detection (read by `is_key_value_layout` and
// `check_key_value_columns`) ---
// Widest region (in columns) for which the column-density check is attempted.
const KV_MAX_WIDTH_FOR_DENSITY_CHECK: u32 = 6;
// Maximum number of leading rows sampled when measuring column density.
const KV_SAMPLE_ROWS: u32 = 20;
// Fraction of sampled rows a column must fill to count as dense.
const KV_DENSITY_THRESHOLD: f32 = 0.4;
// Maximum number of leading rows inspected for label/value pairs.
const KV_CHECK_ROWS: u32 = 15;
// Longest label (in bytes, via `str::len`) still considered a key.
const KV_MAX_LABEL_LEN: usize = 25;
// A text value must be strictly longer than this (in bytes) to count as data.
const KV_MIN_TEXT_VALUE_LEN: usize = 2;
// Label/value pair count compared against in `check_key_value_columns`.
const KV_MIN_PAIRS: u32 = 3;
// NOTE(review): not referenced in the visible part of the file; presumably a
// minimum pairs-per-row ratio — confirm against its use site.
const KV_MIN_PAIR_RATIO: f32 = 0.3;
// --- Header detection scoring ---
// NOTE(review): none of the HEADER_* constants are referenced in the visible part
// of the file; presumably they are consumed by `detect_headers` — confirm there.
const HEADER_MAX_SCAN_ROWS: u32 = 2;
const HEADER_LONG_STRING_PENALTY_THRESHOLD: usize = 40;
const HEADER_LONG_STRING_PENALTY: f32 = 1.5;
const HEADER_PROPER_NOUN_MIN_LEN: usize = 5;
const HEADER_PROPER_NOUN_PENALTY: f32 = 1.0;
const HEADER_DIGIT_STRING_MIN_LEN: usize = 3;
const HEADER_DIGIT_STRING_PENALTY: f32 = 0.5;
const HEADER_DATE_PENALTY: f32 = 1.0;
const HEADER_YEAR_LIKE_BONUS: f32 = 0.5;
const HEADER_YEAR_MIN: f64 = 1900.0;
const HEADER_YEAR_MAX: f64 = 2100.0;
const HEADER_UNIQUE_BONUS: f32 = 0.2;
const HEADER_NUMBER_PENALTY: f32 = 0.3;
const HEADER_SINGLE_COL_MIN_SCORE: f32 = 1.5;
const HEADER_SCORE_TIE_THRESHOLD: f32 = 0.3;
const HEADER_SECOND_ROW_MIN_SCORE_RATIO: f32 = 0.6;
const HEADER_MAX_COLUMNS: u32 = 200;
// --- Region detection caps ---
// `detect_regions` skips recursive splitting and emits a single fallback region
// when the sheet exceeds any of the four size caps below.
const DETECT_MAX_ROWS: u32 = 10_000;
const DETECT_MAX_COLS: u32 = 500;
const DETECT_MAX_AREA: u64 = 5_000_000;
const DETECT_MAX_CELLS: usize = 200_000;
// Budgets loaded into `DetectLimits::new`: leaf count, recursion depth and
// wall-clock milliseconds.
const DETECT_MAX_LEAVES: usize = 200;
const DETECT_MAX_DEPTH: u32 = 12;
const DETECT_MAX_MS: u64 = 200;
// `Occupancy::dense_bounds`: fraction of occupied cells that may be trimmed from
// each edge as outliers, and the minimum occupied-cell count before trimming applies.
const DETECT_OUTLIER_FRACTION: f32 = 0.01;
const DETECT_OUTLIER_MIN_CELLS: usize = 50;
/// A parsed workbook together with its identity metadata and per-sheet caches.
///
/// Instances are built by `load`, `load_from_path` or `load_from_bytes`.
pub struct WorkbookContext {
/// Stable identifier; `load` derives it from the canonicalized path via
/// `hash_path_identity`, the other constructors take it from the caller.
pub id: WorkbookId,
/// Short identifier (`load` builds it with `make_short_workbook_id`).
pub short_id: String,
/// Content revision: supplied by the caller, or computed by hashing the file
/// (`hash_file_sha256_hex`) or the in-memory bytes (`hash_bytes_sha256_hex`).
pub revision_id: String,
/// File stem of the workbook path/display name, or `"workbook"` when there is none.
pub slug: String,
/// On-disk path, or `virtual/<display_name>` for workbooks loaded from bytes.
pub path: PathBuf,
/// Backend capabilities; all constructors here set `BackendCaps::xlsx()`.
pub caps: BackendCaps,
/// Size in bytes of the file or of the in-memory buffer.
pub bytes: u64,
/// Modification time from file metadata; `None` for workbooks loaded from bytes.
pub last_modified: Option<DateTime<Utc>>,
// Parsed workbook; the methods visible here only take read locks on it.
spreadsheet: Arc<RwLock<Spreadsheet>>,
// Lazily filled per-sheet cache keyed by sheet name.
sheet_cache: RwLock<HashMap<String, Arc<SheetCacheEntry>>>,
// Shared atlas handed to `FormulaGraph::build`.
formula_atlas: Arc<FormulaAtlas>,
}
/// Cached per-sheet analysis: eagerly computed metrics plus lazily detected regions.
pub struct SheetCacheEntry {
/// Metrics produced by `compute_sheet_metrics`.
pub metrics: SheetMetrics,
/// Style tags produced alongside the metrics.
pub style_tags: Vec<String>,
/// Named ranges gathered for this sheet when the entry was created.
pub named_ranges: Vec<NamedRangeDescriptor>,
// `None` until region detection has stored a result (see `set_detected_regions`).
detected_regions: RwLock<Option<Vec<crate::model::DetectedRegion>>>,
// Notes emitted by region detection; empty until `set_region_notes` stores some.
region_notes: RwLock<Vec<String>>,
}
/// Aggregate counts for one worksheet, as filled in by `compute_sheet_metrics`.
#[derive(Debug, Clone)]
pub struct SheetMetrics {
/// Highest row reported by `get_highest_column_and_row`.
pub row_count: u32,
/// Highest column reported by `get_highest_column_and_row`.
pub column_count: u32,
/// Number of cells whose value string is non-empty.
pub non_empty_cells: u32,
/// Number of cells for which `is_formula()` is true.
pub formula_cells: u32,
/// Number of formula cells that also carry a non-empty value.
pub cached_values: u32,
/// Number of comments on the sheet.
pub comments: u32,
/// Usage statistics per style key returned by `style::tag_cell`.
pub style_map: HashMap<String, StyleUsage>,
/// Result of `classification::classify` for this sheet.
pub classification: SheetClassification,
}
/// How often one style key occurs on a sheet, with its tags and a few sample cells.
#[derive(Debug, Clone)]
pub struct StyleUsage {
/// Number of cells tagged with this style key.
pub occurrences: u32,
/// Tags reported for the style by `style::tag_cell`.
pub tags: Vec<String>,
/// Example cells using the style (`compute_sheet_metrics` keeps at most five).
pub example_cells: Vec<String>,
}
impl SheetCacheEntry {
    /// Returns a copy of the stored regions, or an empty list when none were stored yet.
    pub fn detected_regions(&self) -> Vec<crate::model::DetectedRegion> {
        let slot = self.detected_regions.read();
        match slot.as_ref() {
            Some(regions) => regions.clone(),
            None => Vec::new(),
        }
    }

    /// Returns a copy of the notes recorded by region detection.
    pub fn region_notes(&self) -> Vec<String> {
        let notes = self.region_notes.read();
        notes.to_vec()
    }

    /// Reports whether a detection result (possibly empty) has been stored.
    pub fn has_detected_regions(&self) -> bool {
        let slot = self.detected_regions.read();
        slot.is_some()
    }

    /// Stores `regions` unless a result is already present; the first writer wins.
    pub fn set_detected_regions(&self, regions: Vec<crate::model::DetectedRegion>) {
        let mut slot = self.detected_regions.write();
        // `get_or_insert` keeps an existing value and drops `regions` in that case.
        let _ = slot.get_or_insert(regions);
    }

    /// Stores `notes` when they are non-empty and no notes were stored before.
    pub fn set_region_notes(&self, notes: Vec<String>) {
        if !notes.is_empty() {
            let mut stored = self.region_notes.write();
            if stored.is_empty() {
                *stored = notes;
            }
        }
    }
}
impl WorkbookContext {
/// Loads a workbook from disk, deriving its identifiers from the path and contents.
///
/// The workbook id comes from the canonicalized path (falling back to `path`
/// itself when canonicalization fails) and the revision id from hashing the file.
///
/// # Errors
/// Fails when the file metadata cannot be read, the file cannot be hashed, or
/// `load_from_path` fails.
pub fn load(_config: &Arc<ServerConfig>, path: &Path) -> Result<Self> {
    // Probe the file first so a missing or unreadable path is reported up front.
    fs::metadata(path).with_context(|| format!("unable to read metadata for {:?}", path))?;

    let identity_path = match fs::canonicalize(path) {
        Ok(resolved) => resolved,
        Err(_) => path.to_path_buf(),
    };
    let slug = match path.file_stem() {
        Some(stem) => stem.to_string_lossy().to_string(),
        None => "workbook".to_string(),
    };

    let id = WorkbookId(hash_path_identity(&identity_path));
    let short_id = make_short_workbook_id(&slug, id.as_str());
    let revision_id = hash_file_sha256_hex(path)
        .with_context(|| format!("unable to hash workbook {:?}", path))?;

    Self::load_from_path(_config, path, id, short_id, Some(revision_id))
}
/// Parses the workbook at `path` using caller-supplied identifiers.
///
/// When `revision_id` is `None` the file is hashed to obtain one.
///
/// # Errors
/// Fails when metadata cannot be read, hashing fails, or the workbook cannot be parsed.
pub fn load_from_path(
    _config: &Arc<ServerConfig>,
    path: &Path,
    stable_id: WorkbookId,
    short_id: String,
    revision_id: Option<String>,
) -> Result<Self> {
    let file_info = fs::metadata(path)
        .with_context(|| format!("unable to read metadata for {:?}", path))?;
    let slug = match path.file_stem() {
        Some(stem) => stem.to_string_lossy().to_string(),
        None => "workbook".to_string(),
    };
    let bytes = file_info.len();
    let last_modified = file_info
        .modified()
        .ok()
        .and_then(system_time_to_rfc3339);

    let revision_id = if let Some(given) = revision_id {
        given
    } else {
        hash_file_sha256_hex(path)
            .with_context(|| format!("unable to hash workbook {:?}", path))?
    };

    let parsed =
        xlsx::read(path).with_context(|| format!("failed to parse workbook {:?}", path))?;

    Ok(Self {
        id: stable_id,
        short_id,
        revision_id,
        slug,
        path: path.to_path_buf(),
        caps: BackendCaps::xlsx(),
        bytes,
        last_modified,
        spreadsheet: Arc::new(RwLock::new(parsed)),
        sheet_cache: RwLock::new(HashMap::new()),
        formula_atlas: Arc::new(FormulaAtlas::default()),
    })
}
/// Parses a workbook from an in-memory buffer.
///
/// The resulting context uses `virtual/<display_name>` as its path, has no
/// modification time, and hashes `bytes` for the revision id when none is supplied.
///
/// # Errors
/// Fails when the bytes cannot be parsed as a workbook.
pub fn load_from_bytes(
    _config: &Arc<ServerConfig>,
    display_name: &str,
    bytes: &[u8],
    stable_id: WorkbookId,
    short_id: String,
    revision_id: Option<String>,
) -> Result<Self> {
    let slug = match Path::new(display_name).file_stem() {
        Some(stem) => stem.to_string_lossy().to_string(),
        None => "workbook".to_string(),
    };

    let parsed = xlsx::read_reader(Cursor::new(bytes), true)
        .with_context(|| format!("failed to parse workbook bytes for {display_name}"))?;

    let revision_id = match revision_id {
        Some(given) => given,
        None => hash_bytes_sha256_hex(bytes),
    };

    Ok(Self {
        id: stable_id,
        short_id,
        revision_id,
        slug,
        path: PathBuf::from(format!("virtual/{display_name}")),
        caps: BackendCaps::xlsx(),
        bytes: bytes.len() as u64,
        last_modified: None,
        spreadsheet: Arc::new(RwLock::new(parsed)),
        sheet_cache: RwLock::new(HashMap::new()),
        formula_atlas: Arc::new(FormulaAtlas::default()),
    })
}
/// Returns the names of all sheets in workbook order.
pub fn sheet_names(&self) -> Vec<String> {
    let book = self.spreadsheet.read();
    let sheets = book.get_sheet_collection();
    let mut names = Vec::with_capacity(sheets.len());
    for sheet in sheets {
        names.push(sheet.get_name().to_string());
    }
    names
}
/// Builds a summary description of the workbook.
///
/// `macros_present` is always reported as `false` and `client_path` as `None`.
pub fn describe(&self) -> WorkbookDescription {
    let book = self.spreadsheet.read();
    let sheets = book.get_sheet_collection();

    let mut tables = 0usize;
    for sheet in sheets {
        tables += sheet.get_tables().len();
    }

    let last_modified = self
        .last_modified
        .as_ref()
        .map(|stamp| stamp.to_rfc3339_opts(chrono::SecondsFormat::Secs, true));

    WorkbookDescription {
        workbook_id: self.id.clone(),
        short_id: self.short_id.clone(),
        slug: self.slug.clone(),
        path: path_to_forward_slashes(&self.path),
        client_path: None,
        bytes: self.bytes,
        sheet_count: sheets.len(),
        defined_names: book.get_defined_names().len(),
        tables,
        macros_present: false,
        last_modified,
        revision_id: Some(self.revision_id.clone()),
        caps: self.caps.clone(),
    }
}
/// Returns the cached entry for `sheet_name`, computing metrics on first use.
///
/// Region detection is not run here; the entry starts with no detected regions.
///
/// # Errors
/// Fails when the sheet does not exist.
pub fn get_sheet_metrics_fast(&self, sheet_name: &str) -> Result<Arc<SheetCacheEntry>> {
    // Fast path: a shared lock is enough when the entry already exists.
    let cached = self.sheet_cache.read().get(sheet_name).cloned();
    if let Some(entry) = cached {
        return Ok(entry);
    }

    // Slow path: re-check under the exclusive lock, since another thread may have
    // filled the entry between releasing the read lock and acquiring this one.
    let mut cache = self.sheet_cache.write();
    if let Some(existing) = cache.get(sheet_name) {
        return Ok(Arc::clone(existing));
    }

    let book = self.spreadsheet.read();
    let Some(sheet) = book.get_sheet_by_name(sheet_name) else {
        return Err(anyhow!("sheet {} not found", sheet_name));
    };

    let (metrics, style_tags) = compute_sheet_metrics(sheet);
    let named_ranges = gather_named_ranges(sheet, book.get_defined_names());
    let fresh = Arc::new(SheetCacheEntry {
        metrics,
        style_tags,
        named_ranges,
        detected_regions: RwLock::new(None),
        region_notes: RwLock::new(Vec::new()),
    });
    cache.insert(sheet_name.to_string(), Arc::clone(&fresh));
    Ok(fresh)
}
pub fn get_sheet_metrics(&self, sheet_name: &str) -> Result<Arc<SheetCacheEntry>> {
let entry = self.get_sheet_metrics_fast(sheet_name)?;
if entry.has_detected_regions() {
return Ok(entry);
}
let book = self.spreadsheet.read();
let sheet = book
.get_sheet_by_name(sheet_name)
.ok_or_else(|| anyhow!("sheet {} not found", sheet_name))?;
let detected = detect_regions(sheet, &entry.metrics);
entry.set_detected_regions(detected.regions);
entry.set_region_notes(detected.notes);
Ok(entry)
}
/// Returns one summary per sheet, in workbook order.
///
/// When `include_bounds` is `false` the size/count fields are `None` and
/// `style_tags` is empty; `classification` is always populated.
///
/// # Errors
/// Fails when metrics for a listed sheet cannot be obtained.
pub fn list_summaries(&self, include_bounds: bool) -> Result<Vec<SheetSummary>> {
    // Snapshot what is needed from the workbook and release the read lock before
    // calling `get_sheet_metrics_fast`. That method acquires the same spreadsheet
    // lock (a recursive read, which parking_lot documents as deadlock-prone when a
    // writer is queued) and takes the cache lock in the opposite order.
    let book = self.spreadsheet.read();
    let sheets: Vec<(String, bool)> = book
        .get_sheet_collection()
        .iter()
        .map(|sheet| {
            (
                sheet.get_name().to_string(),
                sheet.get_sheet_state() != "hidden",
            )
        })
        .collect();
    drop(book);

    let mut summaries = Vec::with_capacity(sheets.len());
    for (name, visible) in sheets {
        let entry = self.get_sheet_metrics_fast(&name)?;
        let metrics = &entry.metrics;
        summaries.push(SheetSummary {
            name,
            visible,
            row_count: include_bounds.then_some(metrics.row_count),
            column_count: include_bounds.then_some(metrics.column_count),
            non_empty_cells: include_bounds.then_some(metrics.non_empty_cells),
            formula_cells: include_bounds.then_some(metrics.formula_cells),
            cached_values: include_bounds.then_some(metrics.cached_values),
            classification: metrics.classification.clone(),
            style_tags: if include_bounds {
                entry.style_tags.clone()
            } else {
                Vec::new()
            },
        });
    }
    Ok(summaries)
}
/// Runs `func` against the named worksheet while holding the workbook read lock.
///
/// # Errors
/// Fails when the sheet does not exist.
pub fn with_sheet<T, F>(&self, sheet_name: &str, func: F) -> Result<T>
where
    F: FnOnce(&Worksheet) -> T,
{
    let book = self.spreadsheet.read();
    let Some(sheet) = book.get_sheet_by_name(sheet_name) else {
        return Err(anyhow!("sheet {} not found", sheet_name));
    };
    let output = func(sheet);
    Ok(output)
}
/// Runs `func` against the whole workbook while holding the read lock.
///
/// This implementation always returns `Ok`.
pub fn with_spreadsheet<T, F>(&self, func: F) -> Result<T>
where
    F: FnOnce(&Spreadsheet) -> T,
{
    let guard = self.spreadsheet.read();
    let output = func(&*guard);
    Ok(output)
}
/// Builds the formula graph for the named sheet using the shared formula atlas.
///
/// # Errors
/// Fails when the sheet does not exist or `FormulaGraph::build` reports an error.
pub fn formula_graph(&self, sheet_name: &str) -> Result<FormulaGraph> {
    let atlas = &self.formula_atlas;
    match self.with_sheet(sheet_name, |sheet| FormulaGraph::build(sheet, atlas)) {
        Ok(built) => built,
        Err(missing_sheet) => Err(missing_sheet),
    }
}
/// Lists defined names followed by tables, as named-item descriptors.
///
/// A defined name whose address starts with `=` is reported as a formula, any
/// other as a named range. Sheet-scoped names get the sheet name at their local
/// sheet index as scope (or `None` when the index is out of range).
pub fn named_items(&self) -> Result<Vec<NamedRangeDescriptor>> {
    let book = self.spreadsheet.read();
    let sheets = book.get_sheet_collection();
    let sheet_names: Vec<String> = sheets
        .iter()
        .map(|sheet| sheet.get_name().to_string())
        .collect();

    let mut items = Vec::new();

    for defined in book.get_defined_names() {
        let refers_to = defined.get_address();
        let scope = defined
            .has_local_sheet_id()
            .then(|| *defined.get_local_sheet_id() as usize)
            .and_then(|index| sheet_names.get(index).cloned());
        let kind = match refers_to.starts_with('=') {
            true => NamedItemKind::Formula,
            false => NamedItemKind::NamedRange,
        };
        items.push(NamedRangeDescriptor {
            name: defined.get_name().to_string(),
            scope: scope.clone(),
            refers_to: refers_to.clone(),
            kind,
            sheet_name: scope,
            comment: None,
        });
    }

    for sheet in sheets {
        let owner = sheet.get_name().to_string();
        for table in sheet.get_tables() {
            let area = table.get_area();
            let first = area.0.get_coordinate();
            let last = area.1.get_coordinate();
            items.push(NamedRangeDescriptor {
                name: table.get_name().to_string(),
                scope: Some(owner.clone()),
                refers_to: format!("{}:{}", first, last),
                kind: NamedItemKind::Table,
                sheet_name: Some(owner.clone()),
                comment: None,
            });
        }
    }

    Ok(items)
}
/// Assembles the overview response for a sheet, running region detection if needed.
///
/// `formula_ratio` is formula cells over non-empty cells, or `0.0` for a sheet
/// with no non-empty cells. `detected_regions_truncated` is always `false` here.
///
/// # Errors
/// Fails when the sheet does not exist.
pub fn sheet_overview(&self, sheet_name: &str) -> Result<SheetOverviewResponse> {
    let entry = self.get_sheet_metrics(sheet_name)?;
    let metrics = &entry.metrics;

    let narrative = classification::narrative(metrics);
    let regions = classification::regions(metrics);
    let key_ranges = classification::key_ranges(metrics);

    let detected_regions = entry.detected_regions();
    // Take the count first so the vector can be moved into the response.
    let detected_region_count = detected_regions.len() as u32;

    let formula_ratio = match metrics.non_empty_cells {
        0 => 0.0,
        non_empty => metrics.formula_cells as f32 / non_empty as f32,
    };

    Ok(SheetOverviewResponse {
        workbook_id: self.id.clone(),
        workbook_short_id: self.short_id.clone(),
        sheet_name: sheet_name.to_string(),
        narrative,
        regions,
        detected_regions,
        detected_region_count,
        detected_regions_truncated: false,
        key_ranges,
        formula_ratio,
        notable_features: entry.style_tags.clone(),
        notes: entry.region_notes(),
    })
}
/// Looks up a single detected region on a sheet by its id.
///
/// # Errors
/// Fails when the sheet does not exist or no region carries `id`.
pub fn detected_region(
    &self,
    sheet_name: &str,
    id: u32,
) -> Result<crate::model::DetectedRegion> {
    let entry = self.get_sheet_metrics(sheet_name)?;
    let found = entry
        .detected_regions()
        .into_iter()
        .find(|region| region.id == id);
    match found {
        Some(region) => Ok(region),
        None => Err(anyhow!("region {} not found on sheet {}", id, sheet_name)),
    }
}
}
/// Reports whether a number-format code contains a date or time token.
///
/// Scans the code outside of double-quoted literals, `[...]` sections and
/// backslash-escaped characters. `y`, `d`, `h` and `s` always count as date/time
/// tokens. `m` counts only when it is adjacent to another `m`, to a time marker
/// (`:` or `h` before it, `:` or `s` after it) or to a date separator (`/`, `-`,
/// `.`), so an isolated `m` is not treated as a date token.
///
/// Matching is ASCII case-insensitive, so `YYYY-MM-DD` is recognised just like
/// `yyyy-mm-dd`.
fn contains_date_time_token(format_code: &str) -> bool {
    // Lowercase once so both the token match and the neighbour checks are
    // case-insensitive.
    let chars: Vec<char> = format_code
        .chars()
        .map(|c| c.to_ascii_lowercase())
        .collect();

    let mut in_quote = false;
    let mut in_bracket = false;
    let mut escaped = false;

    for (i, &ch) in chars.iter().enumerate() {
        if escaped {
            // The character after a backslash is a literal, never a token.
            escaped = false;
            continue;
        }
        match ch {
            '"' => in_quote = !in_quote,
            '\\' if !in_quote => escaped = true,
            '[' if !in_quote => in_bracket = true,
            ']' if !in_quote => in_bracket = false,
            'y' | 'd' | 'h' | 's' if !in_quote && !in_bracket => return true,
            'm' if !in_quote && !in_bracket => {
                let prev = i.checked_sub(1).and_then(|p| chars.get(p)).copied();
                let next = chars.get(i + 1).copied();
                let prev_marks_date_time =
                    matches!(prev, Some(':' | 'h' | 'm' | '/' | '-' | '.'));
                let next_marks_date_time =
                    matches!(next, Some(':' | 's' | 'm' | '/' | '-' | '.'));
                if prev_marks_date_time || next_marks_date_time {
                    return true;
                }
            }
            _ => {}
        }
    }
    false
}
// Number-format ids that `is_date_formatted` treats as date/time without looking
// at the format code.
// NOTE(review): presumably the built-in (and locale-reserved) date/time format
// ids — confirm against the spreadsheet format specification.
const DATE_FORMAT_IDS: &[u32] = &[
14, 15, 16, 17, 18, 19, 20, 21, 22, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 45, 46, 47, 50, 51,
52, 53, 54, 55, 56, 57, 58,
];
// Serial number at and above which `excel_serial_to_iso_with_leap_bug` uses the
// 1899-12-30 epoch instead of 1899-12-31 when leap-bug compensation is enabled.
const EXCEL_LEAP_YEAR_BUG_SERIAL: i64 = 60;
/// Reports whether a cell's number format marks it as a date/time value.
///
/// A cell without a number format is never a date. Otherwise the format id is
/// checked against `DATE_FORMAT_IDS`; failing that, the format code is scanned
/// for date/time tokens unless it is one of a few plain numeric/text codes.
fn is_date_formatted(cell: &umya_spreadsheet::Cell) -> bool {
    match cell.get_style().get_number_format() {
        None => false,
        Some(number_format) => {
            if DATE_FORMAT_IDS.contains(number_format.get_number_format_id()) {
                return true;
            }
            let code = number_format.get_format_code();
            // Common non-date codes are rejected without scanning.
            let is_plain_code = ["General", "@", "0", "0.00"]
                .iter()
                .any(|plain| code == *plain);
            !is_plain_code && contains_date_time_token(code)
        }
    }
}
/// Converts a spreadsheet date serial to a `YYYY-MM-DD` string.
///
/// Delegates to `excel_serial_to_iso_with_leap_bug` with leap-bug compensation
/// enabled.
pub fn excel_serial_to_iso(serial: f64, use_1904_system: bool) -> String {
excel_serial_to_iso_with_leap_bug(serial, use_1904_system, true)
}
/// Converts a spreadsheet date serial to a `YYYY-MM-DD` string.
///
/// Only the whole-day part of `serial` is used. With `use_1904_system` the
/// epoch is 1904-01-01. Otherwise the epoch is 1899-12-31, or 1899-12-30 when
/// `compensate_leap_bug` is set and the serial is at least
/// `EXCEL_LEAP_YEAR_BUG_SERIAL`.
///
/// When the serial cannot be represented as a calendar date (non-finite, or so
/// large that the date arithmetic would overflow) the serial itself is returned
/// formatted as a number, instead of panicking.
pub fn excel_serial_to_iso_with_leap_bug(
    serial: f64,
    use_1904_system: bool,
    compensate_leap_bug: bool,
) -> String {
    use chrono::NaiveDate;

    // `chrono::Duration::days` panics on out-of-range input, and casting an
    // infinite or huge float saturates to `i64::MAX`. This bound is a little wider
    // than the span `NaiveDate` can represent, so anything beyond it cannot
    // produce a date anyway; in-bound overflows are caught by `checked_add_signed`.
    const MAX_ABS_SERIAL_DAYS: f64 = 100_000_000.0;
    if !serial.is_finite() || serial.abs() > MAX_ABS_SERIAL_DAYS {
        return serial.to_string();
    }

    let days = serial.trunc() as i64;
    let epoch = if use_1904_system {
        NaiveDate::from_ymd_opt(1904, 1, 1)
    } else if compensate_leap_bug && days >= EXCEL_LEAP_YEAR_BUG_SERIAL {
        NaiveDate::from_ymd_opt(1899, 12, 30)
    } else {
        NaiveDate::from_ymd_opt(1899, 12, 31)
    };

    epoch
        .and_then(|start| start.checked_add_signed(chrono::Duration::days(days)))
        .map(|date| date.format("%Y-%m-%d").to_string())
        .unwrap_or_else(|| serial.to_string())
}
/// Converts a cell to a typed value, interpreting date serials with the 1900
/// date system (`use_1904_system = false`).
///
/// Returns `None` for a cell whose value string is empty.
pub fn cell_to_value(cell: &umya_spreadsheet::Cell) -> Option<crate::model::CellValue> {
cell_to_value_with_date_system(cell, false)
}
/// Converts a cell's stored value string into a typed `CellValue`.
///
/// * an empty value yields `None`;
/// * a value that parses as a finite number becomes `Date` (when the cell is
///   date-formatted) or `Number`;
/// * `true` / `false` (ASCII case-insensitive) become `Bool`;
/// * anything else becomes `Text`.
///
/// Strings such as `"inf"`, `"Infinity"` or `"NaN"` are accepted by
/// `str::parse::<f64>` but are not spreadsheet numbers, so non-finite parse
/// results are treated as text.
pub fn cell_to_value_with_date_system(
    cell: &umya_spreadsheet::Cell,
    use_1904_system: bool,
) -> Option<crate::model::CellValue> {
    let raw = cell.get_value();
    if raw.is_empty() {
        return None;
    }

    let finite_number = raw.parse::<f64>().ok().filter(|n| n.is_finite());
    if let Some(number) = finite_number {
        let value = if is_date_formatted(cell) {
            crate::model::CellValue::Date(excel_serial_to_iso(number, use_1904_system))
        } else {
            crate::model::CellValue::Number(number)
        };
        return Some(value);
    }

    if raw.eq_ignore_ascii_case("true") {
        return Some(crate::model::CellValue::Bool(true));
    }
    if raw.eq_ignore_ascii_case("false") {
        return Some(crate::model::CellValue::Bool(false));
    }
    Some(crate::model::CellValue::Text(raw.to_string()))
}
/// Scans every cell of `sheet` once and returns its metrics plus the style tags
/// seen on the sheet.
///
/// Counts non-empty cells, formula cells, and formula cells with a stored value;
/// groups cells by the style key from `style::tag_cell` (keeping up to five
/// example cells per style); and classifies the sheet from those numbers.
///
/// The returned tag list is sorted and free of duplicates, so the output is
/// stable across runs regardless of hash-map iteration order.
pub fn compute_sheet_metrics(sheet: &Worksheet) -> (SheetMetrics, Vec<String>) {
    use std::collections::HashMap as StdHashMap;

    // Maximum number of example cells remembered per style.
    const MAX_EXAMPLES_PER_STYLE: usize = 5;

    let mut non_empty = 0u32;
    let mut formulas = 0u32;
    let mut cached = 0u32;
    let comments = sheet.get_comments().len() as u32;
    let mut style_usage: StdHashMap<String, StyleUsage> = StdHashMap::new();

    for cell in sheet.get_cell_collection() {
        let has_value = !cell.get_value().is_empty();
        if has_value {
            non_empty += 1;
        }
        if cell.is_formula() {
            formulas += 1;
            if has_value {
                cached += 1;
            }
        }
        if let Some((style_key, usage)) = style::tag_cell(cell) {
            let entry = style_usage.entry(style_key).or_insert_with(|| StyleUsage {
                occurrences: 0,
                tags: usage.tags.clone(),
                example_cells: Vec::new(),
            });
            entry.occurrences += 1;
            if entry.example_cells.len() < MAX_EXAMPLES_PER_STYLE {
                entry.example_cells.push(usage.example_cell.clone());
            }
        }
    }

    let (max_col, max_row) = sheet.get_highest_column_and_row();
    let classification = classification::classify(
        non_empty,
        formulas,
        max_row,
        max_col,
        comments,
        &style_usage,
    );

    // Several styles can share a tag and map iteration order is arbitrary, so
    // normalise the list.
    let mut style_tags: Vec<String> = style_usage
        .values()
        .flat_map(|usage| usage.tags.iter().cloned())
        .collect();
    style_tags.sort_unstable();
    style_tags.dedup();

    let metrics = SheetMetrics {
        row_count: max_row,
        column_count: max_col,
        non_empty_cells: non_empty,
        formula_cells: formulas,
        cached_values: cached,
        comments,
        style_map: style_usage,
        classification,
    };
    (metrics, style_tags)
}
/// Rectangular cell range with inclusive row and column bounds.
#[derive(Debug, Clone, Copy)]
struct Rect {
// First row of the range (inclusive).
start_row: u32,
// Last row of the range (inclusive).
end_row: u32,
// First column of the range (inclusive).
start_col: u32,
// Last column of the range (inclusive).
end_col: u32,
}
/// What region detection remembers about one cell.
#[derive(Debug, Clone)]
struct CellInfo {
// Typed value from `cell_to_value`; `None` when the cell's value string is empty.
value: Option<crate::model::CellValue>,
// Whether the cell holds a formula.
is_formula: bool,
}
/// Index of the cells present on a sheet, built by `build_occupancy`.
#[derive(Debug)]
struct Occupancy {
// Cell details keyed by `(row, col)`.
cells: HashMap<(u32, u32), CellInfo>,
// For each row, the sorted column numbers of its cells.
rows: HashMap<u32, Vec<u32>>,
// For each column, the sorted row numbers of its cells.
cols: HashMap<u32, Vec<u32>>,
// Bounding box of all indexed cells; `min_*` are reset to 0 when there are none.
min_row: u32,
max_row: u32,
min_col: u32,
max_col: u32,
}
impl Occupancy {
/// Returns the bounding box of all indexed cells, or `None` when there are none.
fn bounds_rect(&self) -> Option<Rect> {
    let has_cells = !self.cells.is_empty();
    has_cells.then(|| Rect {
        start_row: self.min_row,
        end_row: self.max_row,
        start_col: self.min_col,
        end_col: self.max_col,
    })
}
fn dense_bounds(&self) -> Option<Rect> {
let bounds = self.bounds_rect()?;
let total_cells = self.cells.len();
if total_cells < DETECT_OUTLIER_MIN_CELLS {
return Some(bounds);
}
let trim_cells = ((total_cells as f32) * DETECT_OUTLIER_FRACTION).round() as usize;
if trim_cells == 0 || trim_cells * 2 >= total_cells {
return Some(bounds);
}
let mut row_counts: Vec<(u32, usize)> = self
.rows
.iter()
.map(|(row, cols)| (*row, cols.len()))
.collect();
row_counts.sort_by_key(|(row, _)| *row);
let mut col_counts: Vec<(u32, usize)> = self
.cols
.iter()
.map(|(col, rows)| (*col, rows.len()))
.collect();
col_counts.sort_by_key(|(col, _)| *col);
let (start_row, end_row) =
trim_bounds_by_cells(&row_counts, trim_cells, bounds.start_row, bounds.end_row);
let (start_col, end_col) =
trim_bounds_by_cells(&col_counts, trim_cells, bounds.start_col, bounds.end_col);
if start_row > end_row || start_col > end_col {
return Some(bounds);
}
Some(Rect {
start_row,
end_row,
start_col,
end_col,
})
}
/// Counts indexed cells per row and per column inside `rect`.
///
/// Returns `(row_counts, col_counts)`, where index 0 corresponds to
/// `rect.start_row` / `rect.start_col` respectively.
fn row_col_counts(&self, rect: &Rect) -> (Vec<u32>, Vec<u32>) {
    let row_span = rect.start_row..=rect.end_row;
    let col_span = rect.start_col..=rect.end_col;

    let mut row_counts = vec![0u32; (rect.end_row - rect.start_row + 1) as usize];
    let mut col_counts = vec![0u32; (rect.end_col - rect.start_col + 1) as usize];

    let rows_inside = self.rows.iter().filter(|(row, _)| row_span.contains(*row));
    for (&row, members) in rows_inside {
        let slot = (row - rect.start_row) as usize;
        row_counts[slot] = count_in_sorted_range(members, rect.start_col, rect.end_col);
    }

    let cols_inside = self.cols.iter().filter(|(col, _)| col_span.contains(*col));
    for (&col, members) in cols_inside {
        let slot = (col - rect.start_col) as usize;
        col_counts[slot] = count_in_sorted_range(members, rect.start_row, rect.end_row);
    }

    (row_counts, col_counts)
}
fn stats_in_rect(&self, rect: &Rect) -> RegionStats {
let mut stats = RegionStats::default();
for (row, cols) in &self.rows {
if *row < rect.start_row || *row > rect.end_row {
continue;
}
let start_idx = lower_bound(cols, rect.start_col);
let end_idx = upper_bound(cols, rect.end_col);
for col in &cols[start_idx..end_idx] {
if let Some(info) = self.cells.get(&(*row, *col)) {
stats.non_empty += 1;
if info.is_formula {
stats.formulas += 1;
}
if let Some(val) = &info.value {
match val {
crate::model::CellValue::Text(_) => stats.text += 1,
crate::model::CellValue::Number(_) => stats.numbers += 1,
crate::model::CellValue::Bool(_) => stats.bools += 1,
crate::model::CellValue::Date(_) => stats.dates += 1,
crate::model::CellValue::Error(_) => stats.errors += 1,
}
}
}
}
}
stats
}
/// Returns the typed value at `(row, col)`, or `None` when the cell is not
/// indexed or has no typed value.
fn value_at(&self, row: u32, col: u32) -> Option<&crate::model::CellValue> {
    let info = self.cells.get(&(row, col))?;
    info.value.as_ref()
}
}
/// Returns the index of the first element that is not less than `target`.
///
/// `values` must be sorted ascending; the result is `values.len()` when every
/// element is smaller than `target`.
fn lower_bound(values: &[u32], target: u32) -> usize {
    values.partition_point(|&value| value < target)
}
/// Returns the index just past the last element that is not greater than `target`.
///
/// `values` must be sorted ascending; the result is `0` when every element is
/// larger than `target`.
fn upper_bound(values: &[u32], target: u32) -> usize {
    values.partition_point(|&value| value <= target)
}
/// Counts the elements of the sorted slice `values` that lie in `start..=end`.
fn count_in_sorted_range(values: &[u32], start: u32, end: u32) -> u32 {
    match values {
        [] => 0,
        _ => {
            let first = lower_bound(values, start);
            let past_last = upper_bound(values, end);
            // Saturate so an inverted range (`start > end`) yields zero.
            past_last.saturating_sub(first) as u32
        }
    }
}
/// Trims sparse entries from both ends of an ordered `(index, cell_count)` list.
///
/// From each end independently, entries are dropped while their cumulative cell
/// count stays within `trim_cells`. Returns the indices of the first and last
/// surviving entries. `default_start` / `default_end` are used when `entries` is
/// empty or when the respective side consumed every entry. The two sides are
/// computed independently, so the returned start may exceed the returned end.
fn trim_bounds_by_cells(
    entries: &[(u32, usize)],
    trim_cells: usize,
    default_start: u32,
    default_end: u32,
) -> (u32, u32) {
    /// Number of leading items whose cumulative count fits into `budget`.
    fn droppable<'a>(items: impl Iterator<Item = &'a (u32, usize)>, budget: usize) -> usize {
        let mut left = budget;
        let mut dropped = 0usize;
        for &(_, count) in items {
            if left < count {
                break;
            }
            left -= count;
            dropped += 1;
        }
        dropped
    }

    if entries.is_empty() {
        return (default_start, default_end);
    }

    let first_kept = droppable(entries.iter(), trim_cells);
    let kept_end = entries.len() - droppable(entries.iter().rev(), trim_cells);

    let start = entries
        .get(first_kept)
        .map_or(default_start, |&(index, _)| index);
    let end = kept_end
        .checked_sub(1)
        .and_then(|last| entries.get(last))
        .map_or(default_end, |&(index, _)| index);

    (start, end)
}
/// Cell tallies for one rectangle, as produced by `Occupancy::stats_in_rect`.
#[derive(Debug, Default, Clone)]
struct RegionStats {
// Indexed cells inside the rectangle.
non_empty: u32,
// Cells flagged as formulas.
formulas: u32,
// Counts by typed value variant (text, number, bool, date, error).
text: u32,
numbers: u32,
bools: u32,
dates: u32,
errors: u32,
}
/// A band of blank rows or columns along which a rectangle is split.
///
/// `start` and `end` are inclusive sheet coordinates: `split_rect` keeps the
/// area before `start` and the area after `end`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Gutter {
// Horizontal band of rows `start..=end`.
Row { start: u32, end: u32 },
// Vertical band of columns `start..=end`.
Col { start: u32, end: u32 },
}
/// Outcome of `detect_regions`: the regions found plus human-readable notes.
#[derive(Debug, Default)]
struct DetectRegionsResult {
// Detected (or fallback) regions.
regions: Vec<crate::model::DetectedRegion>,
// Notes about caps, truncation or fallback handling.
notes: Vec<String>,
}
/// Time and complexity budget shared by one region-detection run.
#[derive(Debug)]
struct DetectLimits {
// When the run started; the time budget is measured from here.
start: Instant,
// Time budget in milliseconds.
max_ms: u64,
// Leaf-count budget.
max_leaves: usize,
// Maximum recursion depth for `split_rect`.
max_depth: u32,
// Leaves recorded so far via `note_leaf`.
leaves: usize,
// Set once the time budget has been observed as exhausted.
exceeded_time: bool,
// Set once `leaves` reaches `max_leaves`.
exceeded_leaves: bool,
}
impl DetectLimits {
fn new() -> Self {
Self {
start: Instant::now(),
max_ms: DETECT_MAX_MS,
max_leaves: DETECT_MAX_LEAVES,
max_depth: DETECT_MAX_DEPTH,
leaves: 0,
exceeded_time: false,
exceeded_leaves: false,
}
}
fn should_stop(&mut self) -> bool {
if !self.exceeded_time && self.start.elapsed().as_millis() as u64 >= self.max_ms {
self.exceeded_time = true;
}
self.exceeded_time || self.exceeded_leaves
}
fn note_leaf(&mut self) {
self.leaves += 1;
if self.leaves >= self.max_leaves {
self.exceeded_leaves = true;
}
}
}
/// Detects rectangular regions on a sheet by recursively splitting along blank gutters.
///
/// Sheets with no rows, columns or cells yield an empty result. Sheets exceeding
/// the `DETECT_MAX_*` size caps skip splitting and get a single fallback region
/// plus a note. Otherwise the occupied bounds are split into leaves, each leaf is
/// trimmed and turned into a region until the time/leaf budget runs out; if that
/// produces nothing, a fallback region is used.
fn detect_regions(sheet: &Worksheet, metrics: &SheetMetrics) -> DetectRegionsResult {
    if metrics.row_count == 0 || metrics.column_count == 0 {
        return DetectRegionsResult::default();
    }

    let occupancy = build_occupancy(sheet);
    let occupied = occupancy.cells.len();
    if occupied == 0 {
        return DetectRegionsResult::default();
    }

    let area = (metrics.row_count as u64) * (metrics.column_count as u64);
    let too_large = metrics.row_count > DETECT_MAX_ROWS
        || metrics.column_count > DETECT_MAX_COLS
        || area > DETECT_MAX_AREA
        || occupied > DETECT_MAX_CELLS;
    if too_large {
        let mut capped = DetectRegionsResult::default();
        if let Some(bounds) = occupancy.dense_bounds() {
            capped.regions.push(build_fallback_region(&bounds, metrics));
        }
        capped.notes.push(format!(
            "Region detection capped: rows {}, cols {}, occupied {}.",
            metrics.row_count, metrics.column_count, occupied
        ));
        return capped;
    }

    let root = match occupancy.bounds_rect() {
        Some(bounds) => bounds,
        None => Rect {
            start_row: 1,
            end_row: metrics.row_count.max(1),
            start_col: 1,
            end_col: metrics.column_count.max(1),
        },
    };

    let mut limits = DetectLimits::new();
    let mut leaves = Vec::new();
    split_rect(&occupancy, &root, 0, &mut limits, &mut leaves);

    let mut regions = Vec::new();
    for (leaf_index, leaf) in leaves.into_iter().enumerate() {
        if limits.should_stop() {
            break;
        }
        // Region ids follow the leaf order, so ids may have gaps when a leaf is dropped.
        if let Some(trimmed) = trim_rect(&occupancy, leaf, &mut limits) {
            regions.push(build_region(
                &occupancy,
                &trimmed,
                metrics,
                leaf_index as u32,
            ));
        }
    }

    let mut notes = Vec::new();
    if limits.exceeded_time || limits.exceeded_leaves {
        notes.push("Region detection truncated due to time/complexity caps.".to_string());
    }
    if regions.is_empty() {
        if let Some(bounds) = occupancy.dense_bounds() {
            regions.push(build_fallback_region(&bounds, metrics));
            notes.push("Region detection returned no regions; fallback bounds used.".to_string());
        }
    }

    DetectRegionsResult { regions, notes }
}
/// Builds a single low-confidence region covering `rect`.
///
/// The region kind mirrors the sheet classification (calculator or metadata,
/// otherwise data). No headers are extracted; `header_count` is the rect width
/// and `headers_truncated` is set whenever that width is non-zero.
fn build_fallback_region(rect: &Rect, metrics: &SheetMetrics) -> crate::model::DetectedRegion {
    let kind = match metrics.classification {
        SheetClassification::Calculator => crate::model::RegionKind::Calculator,
        SheetClassification::Metadata => crate::model::RegionKind::Metadata,
        _ => crate::model::RegionKind::Data,
    };

    let header_count = rect.end_col - rect.start_col + 1;
    let row_count = rect.end_row - rect.start_row + 1;

    // The end corner is clamped to at least column 1 / row 1.
    let bounds = format!(
        "{}{}:{}{}",
        crate::utils::column_number_to_name(rect.start_col),
        rect.start_row,
        crate::utils::column_number_to_name(rect.end_col.max(1)),
        rect.end_row.max(1)
    );

    crate::model::DetectedRegion {
        id: 0,
        bounds,
        header_row: None,
        headers: Vec::new(),
        header_count,
        headers_truncated: header_count > 0,
        row_count,
        classification: kind.clone(),
        region_kind: Some(kind),
        confidence: 0.2,
    }
}
fn build_occupancy(sheet: &Worksheet) -> Occupancy {
let mut cells = HashMap::new();
let mut rows: HashMap<u32, Vec<u32>> = HashMap::new();
let mut cols: HashMap<u32, Vec<u32>> = HashMap::new();
let mut min_row = u32::MAX;
let mut max_row = 0u32;
let mut min_col = u32::MAX;
let mut max_col = 0u32;
for cell in sheet.get_cell_collection() {
let coord = cell.get_coordinate();
let row = *coord.get_row_num();
let col = *coord.get_col_num();
let value = cell_to_value(cell);
let is_formula = cell.is_formula();
cells.insert((row, col), CellInfo { value, is_formula });
rows.entry(row).or_default().push(col);
cols.entry(col).or_default().push(row);
min_row = min_row.min(row);
max_row = max_row.max(row);
min_col = min_col.min(col);
max_col = max_col.max(col);
}
for cols in rows.values_mut() {
cols.sort_unstable();
}
for rows in cols.values_mut() {
rows.sort_unstable();
}
if cells.is_empty() {
min_row = 0;
min_col = 0;
}
Occupancy {
cells,
rows,
cols,
min_row,
max_row,
min_col,
max_col,
}
}
/// Recursively splits `rect` along its best blank gutter, collecting leaf rectangles.
///
/// A rectangle becomes a leaf when the budget is exhausted, the depth limit is
/// reached, it is a single cell, or no gutter is found. Otherwise the parts on
/// either side of the gutter are processed (top/left first); the gutter itself
/// is discarded.
fn split_rect(
    occupancy: &Occupancy,
    rect: &Rect,
    depth: u32,
    limits: &mut DetectLimits,
    leaves: &mut Vec<Rect>,
) {
    let single_cell = rect.start_row >= rect.end_row && rect.start_col >= rect.end_col;
    if limits.should_stop() || depth >= limits.max_depth || single_cell {
        limits.note_leaf();
        leaves.push(*rect);
        return;
    }

    let Some(gutter) = find_best_gutter(occupancy, rect, limits) else {
        limits.note_leaf();
        leaves.push(*rect);
        return;
    };

    // Each side exists only when the gutter does not touch that edge of the rect.
    let (before, after) = match gutter {
        Gutter::Row { start, end } => (
            (start > rect.start_row).then(|| Rect {
                end_row: start - 1,
                ..*rect
            }),
            (end < rect.end_row).then(|| Rect {
                start_row: end + 1,
                ..*rect
            }),
        ),
        Gutter::Col { start, end } => (
            (start > rect.start_col).then(|| Rect {
                end_col: start - 1,
                ..*rect
            }),
            (end < rect.end_col).then(|| Rect {
                start_col: end + 1,
                ..*rect
            }),
        ),
    };

    for part in [before, after].into_iter().flatten() {
        split_rect(occupancy, &part, depth + 1, limits, leaves);
    }
}
/// Picks the longest interior blank band of rows or columns inside `rect`.
///
/// Returns `None` when the budget is exhausted or no band qualifies. When both a
/// row band and a column band exist, the column band wins only if it is strictly
/// longer.
fn find_best_gutter(
    occupancy: &Occupancy,
    rect: &Rect,
    limits: &mut DetectLimits,
) -> Option<Gutter> {
    if limits.should_stop() {
        return None;
    }

    let (row_counts, col_counts) = occupancy.row_col_counts(rect);
    let width = rect.end_col - rect.start_col + 1;
    let height = rect.end_row - rect.start_row + 1;

    // Runs are reported as offsets into the rect; convert to sheet coordinates.
    let row_candidate = find_blank_runs(&row_counts, width).map(|(start, end, len)| {
        let gutter = Gutter::Row {
            start: rect.start_row + start,
            end: rect.start_row + end,
        };
        (gutter, len)
    });
    let col_candidate = find_blank_runs(&col_counts, height).map(|(start, end, len)| {
        let gutter = Gutter::Col {
            start: rect.start_col + start,
            end: rect.start_col + end,
        };
        (gutter, len)
    });

    match (row_candidate, col_candidate) {
        (Some((row_gutter, row_len)), Some((col_gutter, col_len))) => {
            Some(if col_len > row_len { col_gutter } else { row_gutter })
        }
        (Some((row_gutter, _)), None) => Some(row_gutter),
        (None, Some((col_gutter, _))) => Some(col_gutter),
        (None, None) => None,
    }
}
/// Finds the longest interior run of blank entries in `counts`.
///
/// An entry is blank when it is zero or when it fills less than 5% of `span`.
/// Only runs that touch neither the first nor the last entry qualify, and the
/// run must be at least two entries long. On ties the earliest run wins.
///
/// Returns `(start, end, len)` with inclusive offsets into `counts`, or `None`
/// when no run qualifies.
fn find_blank_runs(counts: &[u32], span: u32) -> Option<(u32, u32, u32)> {
    let mut best: Option<(u32, u32, u32)> = None;
    let mut run_start: Option<u32> = None;

    for (idx, &count) in counts.iter().enumerate() {
        let idx = idx as u32;
        let is_blank = count == 0 || (count as f32 / span as f32) < 0.05;
        if is_blank {
            run_start.get_or_insert(idx);
            continue;
        }
        // A non-blank entry closes the current run. Such a run never touches the
        // last entry; it only has to be checked against touching the first one.
        if let Some(start) = run_start.take() {
            let end = idx - 1;
            let len = end - start + 1;
            let best_len = best.map_or(0, |(_, _, l)| l);
            if start > 0 && len > best_len {
                best = Some((start, end, len));
            }
        }
    }
    // A run still open here reaches the last entry and therefore never qualifies,
    // so no handling is needed after the loop.

    best.filter(|&(_, _, len)| len >= 2)
}
/// Shrinks `rect` by repeatedly peeling off sparse edge rows and columns.
///
/// An edge is sparse when it holds no cells or fills less than 10% of the
/// rectangle's extent in the other direction. All four edges are evaluated per
/// pass, and an edge is only peeled while more than one row/column remains in
/// that direction. Trimming stops early, returning the current rectangle, when
/// the budget is exhausted.
fn trim_rect(occupancy: &Occupancy, rect: Rect, limits: &mut DetectLimits) -> Option<Rect> {
    let is_sparse = |edge: Option<&u32>, extent: u32| -> bool {
        edge.is_some_and(|&filled| filled == 0 || (filled as f32 / extent as f32) < 0.1)
    };

    let mut current = rect;
    loop {
        if limits.should_stop() {
            return Some(current);
        }

        let (row_counts, col_counts) = occupancy.row_col_counts(&current);
        let width = current.end_col - current.start_col + 1;
        let height = current.end_row - current.start_row + 1;

        // Decide all four edges from the same snapshot before moving any of them.
        let peel_top = is_sparse(row_counts.first(), width);
        let peel_bottom = is_sparse(row_counts.last(), width);
        let peel_left = is_sparse(col_counts.first(), height);
        let peel_right = is_sparse(col_counts.last(), height);

        let before = (
            current.start_row,
            current.end_row,
            current.start_col,
            current.end_col,
        );
        if peel_top && current.start_row < current.end_row {
            current.start_row += 1;
        }
        if peel_bottom && current.end_row > current.start_row {
            current.end_row -= 1;
        }
        if peel_left && current.start_col < current.end_col {
            current.start_col += 1;
        }
        if peel_right && current.end_col > current.start_col {
            current.end_col -= 1;
        }
        let after = (
            current.start_row,
            current.end_row,
            current.start_col,
            current.end_col,
        );

        if before == after {
            break;
        }
        if current.start_row > current.end_row || current.start_col > current.end_col {
            return None;
        }
    }
    Some(current)
}
/// Builds the public description of one detected region.
///
/// Runs header detection, gathers cell statistics and classifies the region.
/// `header_count` is the rect width; `headers_truncated` is set whenever the
/// number of detected headers differs from that width.
fn build_region(
    occupancy: &Occupancy,
    rect: &Rect,
    metrics: &SheetMetrics,
    id: u32,
) -> crate::model::DetectedRegion {
    let header_info = detect_headers(occupancy, rect);
    let stats = occupancy.stats_in_rect(rect);
    let (kind, confidence) = classify_region(rect, &stats, &header_info, metrics);

    let header_count = rect.end_col - rect.start_col + 1;
    let detected_headers = header_info.headers.len() as u32;
    let row_count = rect.end_row - rect.start_row + 1;

    let top_left = format!(
        "{}{}",
        crate::utils::column_number_to_name(rect.start_col),
        rect.start_row
    );
    let bottom_right = format!(
        "{}{}",
        crate::utils::column_number_to_name(rect.end_col),
        rect.end_row
    );

    crate::model::DetectedRegion {
        id,
        bounds: format!("{}:{}", top_left, bottom_right),
        header_row: header_info.header_row,
        headers: header_info.headers,
        header_count,
        headers_truncated: detected_headers != header_count,
        row_count,
        classification: kind.clone(),
        region_kind: Some(kind),
        confidence,
    }
}
/// Result of header detection for one region (consumed by `build_region`).
#[derive(Debug, Default)]
struct HeaderInfo {
// Row identified as the header row, if any.
header_row: Option<u32>,
// Header labels found for the region.
headers: Vec<String>,
// NOTE(review): presumably set when the region looks like a key/value layout
// (see `is_key_value_layout`) — confirm where it is assigned.
is_key_value: bool,
}
/// Heuristically decides whether `rect` is laid out as label/value pairs.
///
/// A two-column rect is checked directly. For rects up to
/// `KV_MAX_WIDTH_FOR_DENSITY_CHECK` columns wide, the columns that are filled
/// densely enough in the first `KV_SAMPLE_ROWS` rows are determined; exactly two
/// dense columns are checked as one pair, exactly four as two pairs that must
/// both qualify. Everything else is not a key/value layout.
fn is_key_value_layout(occupancy: &Occupancy, rect: &Rect) -> bool {
    let width = rect.end_col - rect.start_col + 1;
    if width == 2 {
        return check_key_value_columns(occupancy, rect, rect.start_col, rect.start_col + 1);
    }
    if width > KV_MAX_WIDTH_FOR_DENSITY_CHECK {
        return false;
    }

    let sampled_rows = (rect.end_row - rect.start_row + 1).min(KV_SAMPLE_ROWS);
    let min_filled = (sampled_rows as f32 * KV_DENSITY_THRESHOLD) as u32;
    let sample_end = rect.start_row + sampled_rows;

    // Columns whose sampled rows carry enough typed values, left to right.
    let dense_cols: Vec<u32> = (rect.start_col..=rect.end_col)
        .filter(|&col| {
            let filled = (rect.start_row..sample_end)
                .filter(|&row| occupancy.value_at(row, col).is_some())
                .count() as u32;
            filled >= min_filled
        })
        .collect();

    match dense_cols.as_slice() {
        [label_col, value_col] => {
            check_key_value_columns(occupancy, rect, *label_col, *value_col)
        }
        [first_label, first_value, second_label, second_value] if width >= 4 => {
            let first_pair =
                check_key_value_columns(occupancy, rect, *first_label, *first_value);
            let second_pair =
                check_key_value_columns(occupancy, rect, *second_label, *second_value);
            first_pair && second_pair
        }
        _ => false,
    }
}
/// Tests whether `label_col` / `value_col` behave like a key/value pair of columns.
///
/// Only the first `KV_CHECK_ROWS` rows of `rect` are inspected. A row counts as
/// a pair when the label cell is text that is at most `KV_MAX_LABEL_LEN` bytes
/// long, has no ASCII digit and at least one alphabetic character, and the value
/// cell is a number, a date, or text longer than `KV_MIN_TEXT_VALUE_LEN` bytes.
/// The columns qualify when there are at least `KV_MIN_PAIRS` such rows and they
/// make up at least `KV_MIN_PAIR_RATIO` of the inspected rows.
fn check_key_value_columns(
    occupancy: &Occupancy,
    rect: &Rect,
    label_col: u32,
    value_col: u32,
) -> bool {
    let rows_to_check = (rect.end_row - rect.start_row + 1).min(KV_CHECK_ROWS);

    let pair_count = (rect.start_row..rect.start_row + rows_to_check)
        .filter(|&row| {
            let label_cell = occupancy.value_at(row, label_col);
            let value_cell = occupancy.value_at(row, value_col);
            let (Some(crate::model::CellValue::Text(label)), Some(value)) =
                (label_cell, value_cell)
            else {
                return false;
            };

            let key_like = label.len() <= KV_MAX_LABEL_LEN
                && label.chars().all(|c| !c.is_ascii_digit())
                && label.chars().any(char::is_alphabetic);

            let data_like = match value {
                crate::model::CellValue::Number(_) | crate::model::CellValue::Date(_) => true,
                crate::model::CellValue::Text(text) => text.len() > KV_MIN_TEXT_VALUE_LEN,
                _ => false,
            };

            key_like && data_like
        })
        .count() as u32;

    let pair_ratio = pair_count as f32 / rows_to_check as f32;
    pair_count >= KV_MIN_PAIRS && pair_ratio >= KV_MIN_PAIR_RATIO
}
/// Scores how much a text cell looks like data rather than a column header.
///
/// Returns `0.0` for an empty string and `HEADER_LONG_STRING_PENALTY` for any
/// string longer than `HEADER_LONG_STRING_PENALTY_THRESHOLD` bytes. Otherwise
/// the result is the sum of:
/// * `HEADER_PROPER_NOUN_PENALTY` when the string starts with an uppercase
///   character, has a lowercase character after the first, is not all-caps and
///   is longer than `HEADER_PROPER_NOUN_MIN_LEN` bytes;
/// * `HEADER_DIGIT_STRING_PENALTY` when it contains an ASCII digit and is
///   longer than `HEADER_DIGIT_STRING_MIN_LEN` bytes.
fn header_data_penalty(s: &str) -> f32 {
    let Some(leading) = s.chars().next() else {
        return 0.0;
    };
    let byte_len = s.len();
    if byte_len > HEADER_LONG_STRING_PENALTY_THRESHOLD {
        return HEADER_LONG_STRING_PENALTY;
    }

    let starts_upper = leading.is_uppercase();
    let lower_after_first = s.chars().skip(1).any(char::is_lowercase);
    let every_letter_upper = s.chars().all(|c| !c.is_alphabetic() || c.is_uppercase());
    let looks_like_proper_noun = starts_upper
        && lower_after_first
        && !every_letter_upper
        && byte_len > HEADER_PROPER_NOUN_MIN_LEN;
    let contains_digit = s.chars().any(|c| c.is_ascii_digit());

    let noun_part = if looks_like_proper_noun {
        HEADER_PROPER_NOUN_PENALTY
    } else {
        0.0
    };
    let digit_part = if contains_digit && byte_len > HEADER_DIGIT_STRING_MIN_LEN {
        HEADER_DIGIT_STRING_PENALTY
    } else {
        0.0
    };
    noun_part + digit_part
}
/// Detects the header row(s) of `rect` and builds one label per column.
///
/// Key/value layouts get no header row and are labelled with their column
/// names. Regions wider than `HEADER_MAX_COLUMNS` get no headers at all.
/// Otherwise the first rows of the region are scored, a header row (and
/// optionally the row directly below it) is selected, and the header cells of
/// each column are joined with `" / "`. Columns without usable header text fall
/// back to their column name.
fn detect_headers(occupancy: &Occupancy, rect: &Rect) -> HeaderInfo {
// Key/value regions have no header row; label columns by their column name.
if is_key_value_layout(occupancy, rect) {
let mut headers = Vec::new();
for col in rect.start_col..=rect.end_col {
headers.push(crate::utils::column_number_to_name(col));
}
return HeaderInfo {
header_row: None,
headers,
is_key_value: true,
};
}
// Very wide regions are not scanned; they report an empty header list.
let width = rect.end_col - rect.start_col + 1;
if width > HEADER_MAX_COLUMNS {
return HeaderInfo {
header_row: None,
headers: Vec::new(),
is_key_value: false,
};
}
// Score each row from `start_row` through `start_row + HEADER_MAX_SCAN_ROWS`
// (inclusive), clamped to the last row of the region.
// Each candidate is `(row, score, text_count, non_empty_count)`.
let mut candidates = Vec::new();
let max_row = rect
.start_row
.saturating_add(HEADER_MAX_SCAN_ROWS)
.min(rect.end_row);
for row in rect.start_row..=max_row {
let mut text = 0;
let mut numbers = 0;
let mut non_empty = 0;
let mut unique = HashSet::new();
let mut data_like_penalty: f32 = 0.0;
let mut year_like_bonus: f32 = 0.0;
for col in rect.start_col..=rect.end_col {
if let Some(val) = occupancy.value_at(row, col) {
non_empty += 1;
match val {
crate::model::CellValue::Text(s) => {
text += 1;
unique.insert(s.clone());
data_like_penalty += header_data_penalty(s);
}
crate::model::CellValue::Number(n) => {
// Whole numbers in the year range are treated as header text
// (e.g. a row of years) and earn a bonus instead of a penalty.
if *n >= HEADER_YEAR_MIN && *n <= HEADER_YEAR_MAX && n.fract() == 0.0 {
year_like_bonus += HEADER_YEAR_LIKE_BONUS;
text += 1;
} else {
numbers += 1;
}
}
crate::model::CellValue::Bool(_) => text += 1,
crate::model::CellValue::Date(_) => {
data_like_penalty += HEADER_DATE_PENALTY;
}
// Errors only contribute to `non_empty`.
crate::model::CellValue::Error(_) => {}
}
}
}
// Completely empty rows are never header candidates.
if non_empty == 0 {
continue;
}
let score = text as f32 + unique.len() as f32 * HEADER_UNIQUE_BONUS
- numbers as f32 * HEADER_NUMBER_PENALTY
- data_like_penalty
+ year_like_bonus;
candidates.push((row, score, text, non_empty));
}
// A row qualifies as a header when at least half of its non-empty cells are
// text-like; single-column regions additionally need a minimum score.
let is_single_col = rect.start_col == rect.end_col;
let header_candidates: Vec<&(u32, f32, u32, u32)> = candidates
.iter()
.filter(|(_, score, text, non_empty)| {
*text >= 1
&& *text * 2 >= *non_empty
&& (!is_single_col || *score > HEADER_SINGLE_COL_MIN_SCORE)
})
.collect();
// Highest score wins; on equal scores the comparison favours the lower row.
let best = header_candidates.iter().copied().max_by(|a, b| {
a.1.partial_cmp(&b.1)
.unwrap_or(Ordering::Equal)
.then_with(|| b.0.cmp(&a.0))
});
let earliest = header_candidates
.iter()
.copied()
.min_by(|a, b| a.0.cmp(&b.0));
// Prefer the earliest qualifying row unless the best row beats it by more
// than `HEADER_SCORE_TIE_THRESHOLD`.
let maybe_header = match (best, earliest) {
(Some(best_row), Some(early_row)) => {
if (best_row.1 - early_row.1).abs() <= HEADER_SCORE_TIE_THRESHOLD {
Some(early_row.0)
} else {
Some(best_row.0)
}
}
(Some(best_row), None) => Some(best_row.0),
_ => None,
};
let mut header_rows = Vec::new();
if let Some(hr) = maybe_header {
header_rows.push(hr);
// Treat the next row as a second header row when it also passes the
// text-ratio check and scores at least
// `HEADER_SECOND_ROW_MIN_SCORE_RATIO` times the chosen header row's score.
if hr < rect.end_row
&& let Some((_, score_next, text_next, non_empty_next)) =
candidates.iter().find(|(r, _, _, _)| *r == hr + 1)
&& *text_next >= 1
&& *text_next * 2 >= *non_empty_next
&& *score_next
>= HEADER_SECOND_ROW_MIN_SCORE_RATIO
* candidates
.iter()
.find(|(r, _, _, _)| *r == hr)
.map(|c| c.1)
.unwrap_or(0.0)
{
header_rows.push(hr + 1);
}
}
// Build one label per column from the selected header row(s).
let mut headers = Vec::new();
for col in rect.start_col..=rect.end_col {
let mut parts = Vec::new();
for hr in &header_rows {
if let Some(val) = occupancy.value_at(*hr, col) {
match val {
crate::model::CellValue::Text(s) if !s.trim().is_empty() => {
parts.push(s.trim().to_string())
}
crate::model::CellValue::Number(n) => parts.push(n.to_string()),
crate::model::CellValue::Bool(b) => parts.push(b.to_string()),
crate::model::CellValue::Date(d) => parts.push(d.clone()),
crate::model::CellValue::Error(e) => parts.push(e.clone()),
// Whitespace-only text contributes nothing.
_ => {}
}
}
}
// No usable header text for this column: fall back to the column name.
if parts.is_empty() {
headers.push(crate::utils::column_number_to_name(col));
} else {
headers.push(parts.join(" / "));
}
}
HeaderInfo {
header_row: header_rows.first().copied(),
headers,
is_key_value: false,
}
}
/// Assigns a [`crate::model::RegionKind`] and a confidence score to a region.
///
/// Rules are evaluated in order and the first match wins:
/// 1. `Outputs` — more than 25% formulas and positioned as an outputs band
///    (see `is_outputs_band`).
/// 2. `Calculator` — more than 55% formulas.
/// 3. `Metadata` — at most 3 rows by 4 columns, mostly text, ending within the
///    last three rows of the sheet.
/// 4. `Parameters` — a key/value layout, or a narrow mixed text/number block
///    with few formulas.
/// 5. `Metadata` — a small, sparse, text-heavy block.
/// 6. `Data` — everything else.
///
/// Confidence starts at 0.4 and gains 0.2 for a detected header row, up to 0.2
/// each for density and formula ratio, and 0.1 for narrow (at most four
/// columns) `Parameters`/`Metadata` regions. It is capped at 1.0.
fn classify_region(
    rect: &Rect,
    stats: &RegionStats,
    header_info: &HeaderInfo,
    metrics: &SheetMetrics,
) -> (crate::model::RegionKind, f32) {
    let width = rect.end_col - rect.start_col + 1;
    let height = rect.end_row - rect.start_row + 1;

    // Widen before multiplying: a `u32` product can overflow for very large
    // rectangles (panic in debug builds, silent wrap-around in release).
    // `max(1)` keeps the area non-zero, so the division below is always defined.
    let area = u64::from(width.max(1)) * u64::from(height.max(1));
    let density = stats.non_empty as f32 / area as f32;

    let formula_ratio = if stats.non_empty == 0 {
        0.0
    } else {
        stats.formulas as f32 / stats.non_empty as f32
    };
    let text_ratio = if stats.non_empty == 0 {
        0.0
    } else {
        stats.text as f32 / stats.non_empty as f32
    };

    let kind = if formula_ratio > 0.25 && is_outputs_band(rect, metrics, height, width) {
        crate::model::RegionKind::Outputs
    } else if formula_ratio > 0.55 {
        crate::model::RegionKind::Calculator
    } else if height <= 3
        && width <= 4
        && text_ratio > 0.5
        && rect.end_row >= metrics.row_count.saturating_sub(3)
    {
        crate::model::RegionKind::Metadata
    } else if header_info.is_key_value
        || (formula_ratio < 0.25
            && stats.numbers > 0
            && stats.text > 0
            && text_ratio >= 0.3
            && (width <= 2 || (width <= 3 && header_info.header_row.is_none())))
    {
        crate::model::RegionKind::Parameters
    } else if height <= 4 && width <= 6 && formula_ratio < 0.2 && text_ratio > 0.4 && density < 0.5
    {
        crate::model::RegionKind::Metadata
    } else {
        crate::model::RegionKind::Data
    };

    let mut confidence: f32 = 0.4;
    if header_info.header_row.is_some() {
        confidence += 0.2;
    }
    confidence += (density * 0.2).min(0.2);
    confidence += (formula_ratio * 0.2).min(0.2);
    if matches!(
        kind,
        crate::model::RegionKind::Parameters | crate::model::RegionKind::Metadata
    ) && width <= 4
    {
        confidence += 0.1;
    }

    (kind, confidence.min(1.0))
}
/// Reports whether a region is positioned like an "outputs" band.
///
/// The region must not start at the top-left cell (row 1, column 1), and the
/// sheet must have more than 10 rows or more than 6 columns. Given that, the
/// region qualifies when it is at most 6 rows tall and ends within the last 6
/// rows of the sheet, or is at most 6 columns wide and ends within the last 3
/// columns of the sheet.
fn is_outputs_band(rect: &Rect, metrics: &SheetMetrics, height: u32, width: u32) -> bool {
    let starts_at_top_left = rect.start_row <= 1 && rect.start_col <= 1;
    if starts_at_top_left {
        return false;
    }
    let sheet_is_small = metrics.row_count <= 10 && metrics.column_count <= 6;
    if sheet_is_small {
        return false;
    }

    let hugs_bottom = rect.end_row >= metrics.row_count.saturating_sub(6);
    let hugs_right = rect.end_col >= metrics.column_count.saturating_sub(3);

    let shallow_bottom_band = height <= 6 && hugs_bottom;
    let narrow_right_band = width <= 6 && hugs_right;
    shallow_bottom_band || narrow_right_band
}
/// Collects the defined names whose address refers to `sheet`.
///
/// A name is included when its address contains a sheet-qualified reference to
/// this sheet (see `address_targets_sheet`). A plain substring test is not
/// enough: it would make `Sheet1` pick up names that point at `Sheet10` or
/// `MySheet1`.
///
/// `scope` is set to the sheet name when the defined name carries a local sheet
/// id, and is `None` otherwise.
fn gather_named_ranges(
    sheet: &Worksheet,
    defined_names: &[DefinedName],
) -> Vec<NamedRangeDescriptor> {
    let sheet_name = sheet.get_name();
    defined_names
        .iter()
        .filter_map(|defined| {
            // Fetch the address once and reuse it for both the filter and the result.
            let address = defined.get_address();
            if !address_targets_sheet(&address, sheet_name) {
                return None;
            }
            Some(NamedRangeDescriptor {
                name: defined.get_name().to_string(),
                scope: defined
                    .has_local_sheet_id()
                    .then(|| sheet_name.to_string()),
                refers_to: address,
                kind: NamedItemKind::NamedRange,
                sheet_name: Some(sheet_name.to_string()),
                comment: None,
            })
        })
        .collect()
}

/// Returns `true` when `address` contains a reference qualified with `sheet`.
///
/// Two spellings are recognised:
/// * unquoted, `Sheet!A1` — the name must not be preceded by an identifier
///   character (alphanumeric, `_` or `.`), so `Sheet1` does not match
///   `MySheet1!A1`;
/// * quoted, `'My Sheet'!A1` or `'[Book.xlsx]My Sheet'!A1` — the name must be
///   directly preceded by the opening quote or a closing `]`. Apostrophes in
///   the sheet name are accepted both raw and doubled (`''`).
///
/// In both spellings the name must be followed directly by `!` (or `'!`), so
/// `Sheet1` does not match `Sheet10!A1`. As a consequence a 3-D reference such
/// as `Sheet1:Sheet3!A1` matches only its last sheet. An empty sheet name
/// never matches.
fn address_targets_sheet(address: &str, sheet: &str) -> bool {
    if sheet.is_empty() {
        return false;
    }
    let char_before = |idx: usize| address[..idx].chars().next_back();

    let bare_needle = format!("{sheet}!");
    let bare_hit = address.match_indices(bare_needle.as_str()).any(|(idx, _)| {
        char_before(idx).map_or(true, |c| !(c.is_alphanumeric() || c == '_' || c == '.'))
    });
    if bare_hit {
        return true;
    }

    let quoted_hit = |name: &str| {
        let needle = format!("{name}'!");
        address
            .match_indices(needle.as_str())
            .any(|(idx, _)| matches!(char_before(idx), Some('\'' | ']')))
    };
    let escaped = sheet.replace('\'', "''");
    quoted_hit(sheet) || (escaped != sheet && quoted_hit(&escaped))
}
pub fn build_workbook_list(
config: &Arc<ServerConfig>,
filter: &WorkbookFilter,
) -> Result<WorkbookListResponse> {
let mut descriptors = Vec::new();
if let Some(single) = config.single_workbook() {
let metadata = fs::metadata(single)
.with_context(|| format!("unable to read metadata for {:?}", single))?;
let canonical = fs::canonicalize(single).unwrap_or_else(|_| single.to_path_buf());
let id = WorkbookId(hash_path_identity(&canonical));
let slug = single
.file_stem()
.map(|s| s.to_string_lossy().to_string())
.unwrap_or_else(|| "workbook".to_string());
let folder = derive_folder(config, single);
let short_id = make_short_workbook_id(&slug, id.as_str());
let caps = BackendCaps::xlsx();
if filter.matches(&slug, folder.as_deref(), single) {
let relative = single
.strip_prefix(&config.workspace_root)
.unwrap_or(single);
let descriptor = crate::model::WorkbookDescriptor {
workbook_id: id,
short_id,
slug,
folder,
path: Some(path_to_forward_slashes(relative)),
client_path: None,
bytes: metadata.len(),
last_modified: metadata
.modified()
.ok()
.and_then(system_time_to_rfc3339)
.map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
revision_id: Some(hash_file_sha256_hex(single)?),
caps: Some(caps),
};
descriptors.push(descriptor);
}
return Ok(WorkbookListResponse {
workbooks: descriptors,
next_offset: None,
});
}
use walkdir::WalkDir;
for entry in WalkDir::new(&config.workspace_root) {
let entry = entry?;
if !entry.file_type().is_file() {
continue;
}
let path = entry.path();
if !has_supported_extension(&config.supported_extensions, path) {
continue;
}
let metadata = entry.metadata()?;
let canonical = fs::canonicalize(path).unwrap_or_else(|_| path.to_path_buf());
let id = WorkbookId(hash_path_identity(&canonical));
let slug = path
.file_stem()
.map(|s| s.to_string_lossy().to_string())
.unwrap_or_else(|| "workbook".to_string());
let folder = derive_folder(config, path);
let short_id = make_short_workbook_id(&slug, id.as_str());
let caps = BackendCaps::xlsx();
if !filter.matches(&slug, folder.as_deref(), path) {
continue;
}
let relative = path.strip_prefix(&config.workspace_root).unwrap_or(path);
let descriptor = crate::model::WorkbookDescriptor {
workbook_id: id,
short_id,
slug,
folder,
path: Some(path_to_forward_slashes(relative)),
client_path: None,
bytes: metadata.len(),
last_modified: metadata
.modified()
.ok()
.and_then(system_time_to_rfc3339)
.map(|dt| dt.to_rfc3339_opts(chrono::SecondsFormat::Secs, true)),
revision_id: Some(hash_file_sha256_hex(path)?),
caps: Some(caps),
};
descriptors.push(descriptor);
}
descriptors.sort_by(|a, b| a.slug.cmp(&b.slug));
Ok(WorkbookListResponse {
workbooks: descriptors,
next_offset: None,
})
}
/// Returns the name of the directory that directly contains `path`, taken
/// relative to the workspace root.
///
/// Yields `None` when `path` is not under `config.workspace_root`, or when the
/// relative path has no parent component with a file name (for example a file
/// sitting directly in the workspace root).
fn derive_folder(config: &Arc<ServerConfig>, path: &Path) -> Option<String> {
    let relative = path.strip_prefix(&config.workspace_root).ok()?;
    let containing_dir = relative.parent()?;
    let dir_name = containing_dir.file_name()?;
    Some(dir_name.to_string_lossy().to_string())
}
/// Reports whether `path` has an extension listed in `allowed`.
///
/// The comparison ignores ASCII case on both sides, so `report.XLSX` matches an
/// allow-list entry of `xlsx` and vice versa. Paths without an extension, or
/// with an extension that is not valid UTF-8, never match.
fn has_supported_extension(allowed: &[String], path: &Path) -> bool {
    let Some(ext) = path.extension().and_then(|ext| ext.to_str()) else {
        return false;
    };
    // `eq_ignore_ascii_case` avoids allocating a lowercased copy per file and
    // tolerates allow-list entries that were not normalised to lowercase.
    allowed
        .iter()
        .any(|candidate| candidate.eq_ignore_ascii_case(ext))
}