Skip to main content

excel_cli/cli/
inspect.rs

1use serde_json::{json, Value};
2use std::collections::HashMap;
3
4use crate::cli::args::InspectCommands;
5use crate::cli::common::{file_format, format_range, sheet_by_index};
6use crate::cli::envelope;
7use crate::cli::error::AppError;
8use crate::cli::sheet_query::{
9    cell_at, load_target_sheet, resolve_bounds, resolve_header_row, resolve_optional_header_row,
10};
11use crate::excel::{open_workbook, Cell, CellType, Sheet};
12use crate::utils::index_to_col_name;
13
14pub fn handle(cmd: InspectCommands) -> Result<Value, AppError> {
15    match cmd {
16        InspectCommands::Workbook { file, format: _ } => inspect_workbook(file),
17        InspectCommands::Sheet {
18            file,
19            sheet,
20            sheet_index,
21            format: _,
22        } => inspect_sheet(file, sheet, sheet_index),
23        InspectCommands::Sample {
24            file,
25            sheet,
26            sheet_index,
27            range,
28            rows,
29            header_row,
30            format: _,
31        } => inspect_sample(file, sheet, sheet_index, range, rows, header_row),
32        InspectCommands::Columns {
33            file,
34            sheet,
35            header_row,
36            format: _,
37        } => inspect_columns(file, sheet, header_row),
38        InspectCommands::Tables {
39            file,
40            sheet,
41            format: _,
42        } => inspect_tables(file, sheet),
43    }
44}
45
46fn inspect_workbook(file: std::path::PathBuf) -> Result<Value, AppError> {
47    let format_str = file_format(&file);
48    let path_str = file.to_string_lossy().to_string();
49
50    let workbook = open_workbook(&file, false).map_err(crate::cli::error::anyhow_to_app_error)?;
51
52    let sheets: Vec<Value> = workbook
53        .get_sheet_names()
54        .iter()
55        .enumerate()
56        .map(|(index, name)| {
57            let is_empty = if let Some(sheet) = workbook.get_sheet_by_index(index) {
58                sheet.max_rows == 0 || sheet.max_cols == 0
59            } else {
60                true
61            };
62            json!({
63                "name": name,
64                "index": index,
65                "is_empty": is_empty,
66                "is_hidden_if_available": false,
67            })
68        })
69        .collect();
70
71    let data = json!({
72        "sheet_count": sheets.len(),
73        "sheets": sheets,
74    });
75
76    Ok(envelope::success_envelope(
77        "inspect.workbook",
78        &path_str,
79        &format_str,
80        envelope::target_workbook(),
81        json!({}),
82        data,
83        vec![],
84    ))
85}
86
87fn inspect_sheet(
88    file: std::path::PathBuf,
89    sheet: Option<String>,
90    sheet_index: Option<usize>,
91) -> Result<Value, AppError> {
92    let format_str = file_format(&file);
93    let path_str = file.to_string_lossy().to_string();
94
95    let mut workbook =
96        open_workbook(&file, false).map_err(crate::cli::error::anyhow_to_app_error)?;
97
98    let resolved_sheet = load_target_sheet(&workbook, &sheet, &sheet_index)?;
99
100    workbook
101        .ensure_sheet_loaded(resolved_sheet.index, &resolved_sheet.name)
102        .map_err(crate::cli::error::anyhow_to_app_error)?;
103
104    let sheet_obj = sheet_by_index(&workbook, resolved_sheet.index, &resolved_sheet.name)?;
105
106    let used_range = workbook
107        .get_used_range(resolved_sheet.index)
108        .map_err(crate::cli::error::anyhow_to_app_error)?;
109
110    let non_empty_rows = workbook
111        .count_non_empty_rows(resolved_sheet.index)
112        .map_err(crate::cli::error::anyhow_to_app_error)?;
113    let non_empty_cols = workbook
114        .count_non_empty_cols(resolved_sheet.index)
115        .map_err(crate::cli::error::anyhow_to_app_error)?;
116
117    let (header_candidates, recommended_header_row) = workbook
118        .find_header_candidates(resolved_sheet.index)
119        .map_err(crate::cli::error::anyhow_to_app_error)?;
120
121    let data = json!({
122        "name": sheet_obj.name,
123        "index": resolved_sheet.index,
124        "used_range": used_range,
125        "max_rows": sheet_obj.max_rows,
126        "max_cols": sheet_obj.max_cols,
127        "non_empty_rows": non_empty_rows,
128        "non_empty_cols": non_empty_cols,
129        "recommended_header_row": recommended_header_row,
130        "header_candidates": header_candidates,
131    });
132
133    Ok(envelope::success_envelope(
134        "inspect.sheet",
135        &path_str,
136        &format_str,
137        envelope::target_sheet(&resolved_sheet.name, resolved_sheet.index),
138        json!({}),
139        data,
140        vec![],
141    ))
142}
143
144fn inspect_sample(
145    file: std::path::PathBuf,
146    sheet: Option<String>,
147    sheet_index: Option<usize>,
148    range: Option<String>,
149    rows: Option<usize>,
150    header_row: String,
151) -> Result<Value, AppError> {
152    let format_str = file_format(&file);
153    let path_str = file.to_string_lossy().to_string();
154
155    let mut workbook =
156        open_workbook(&file, false).map_err(crate::cli::error::anyhow_to_app_error)?;
157
158    let resolved_sheet = load_target_sheet(&workbook, &sheet, &sheet_index)?;
159
160    workbook
161        .ensure_sheet_loaded(resolved_sheet.index, &resolved_sheet.name)
162        .map_err(crate::cli::error::anyhow_to_app_error)?;
163
164    let sheet_obj = sheet_by_index(&workbook, resolved_sheet.index, &resolved_sheet.name)?;
165    let bounds = resolve_bounds(&workbook, sheet_obj, resolved_sheet.index, range.as_deref())?;
166
167    // Apply row limit
168    let row_limit = rows.unwrap_or(10);
169    let sample_end_row = (bounds.start_row + row_limit.saturating_sub(1)).min(bounds.end_row);
170
171    let resolved_header =
172        resolve_optional_header_row(&workbook, sheet_obj, resolved_sheet.index, &header_row)?;
173
174    let sample_mode = if resolved_header.is_some() {
175        "records"
176    } else {
177        "rows"
178    };
179
180    let range_str = format_range(
181        bounds.start_row,
182        bounds.start_col,
183        sample_end_row,
184        bounds.end_col,
185    );
186
187    let data = if let Some(header_row_idx) = resolved_header {
188        // Build records with headers
189        let mut headers = Vec::new();
190        if header_row_idx < sheet_obj.data.len() {
191            for col in bounds.start_col..=bounds.end_col {
192                let val = if col < sheet_obj.data[header_row_idx].len() {
193                    sheet_obj.data[header_row_idx][col].value.clone()
194                } else {
195                    String::new()
196                };
197                headers.push(val);
198            }
199        }
200
201        let mut records = Vec::new();
202        for row in bounds.start_row..=sample_end_row {
203            if row == header_row_idx {
204                continue;
205            }
206            if row >= sheet_obj.data.len() {
207                break;
208            }
209            let mut record = serde_json::Map::new();
210            for (col_idx, col) in (bounds.start_col..=bounds.end_col).enumerate() {
211                let key = headers.get(col_idx).cloned().unwrap_or_default();
212                let key = if key.is_empty() {
213                    format!("col_{}", col_idx + 1)
214                } else {
215                    key
216                };
217                let value = if col < sheet_obj.data[row].len() {
218                    crate::json_export::process_cell_value(&sheet_obj.data[row][col])
219                } else {
220                    Value::Null
221                };
222                record.insert(key, value);
223            }
224            records.push(Value::Object(record));
225        }
226
227        json!({
228            "resolved_header_row": header_row_idx,
229            "sample_mode": sample_mode,
230            "records": records,
231        })
232    } else {
233        // Raw rows
234        let mut row_values = Vec::new();
235        for row in bounds.start_row..=sample_end_row {
236            if row >= sheet_obj.data.len() {
237                break;
238            }
239            let mut cols = Vec::new();
240            for col in bounds.start_col..=bounds.end_col {
241                let value = if col < sheet_obj.data[row].len() {
242                    crate::json_export::process_cell_value(&sheet_obj.data[row][col])
243                } else {
244                    Value::Null
245                };
246                cols.push(value);
247            }
248            row_values.push(Value::Array(cols));
249        }
250
251        json!({
252            "resolved_header_row": Value::Null,
253            "sample_mode": sample_mode,
254            "rows": row_values,
255        })
256    };
257
258    Ok(envelope::success_envelope(
259        "inspect.sample",
260        &path_str,
261        &format_str,
262        envelope::target_range(&resolved_sheet.name, resolved_sheet.index, &range_str),
263        json!({}),
264        data,
265        vec![],
266    ))
267}
268
269fn inspect_columns(
270    file: std::path::PathBuf,
271    sheet: String,
272    header_row: String,
273) -> Result<Value, AppError> {
274    let format_str = file_format(&file);
275    let path_str = file.to_string_lossy().to_string();
276
277    let mut workbook =
278        open_workbook(&file, false).map_err(crate::cli::error::anyhow_to_app_error)?;
279
280    let resolved_sheet = load_target_sheet(&workbook, &Some(sheet), &None)?;
281
282    workbook
283        .ensure_sheet_loaded(resolved_sheet.index, &resolved_sheet.name)
284        .map_err(crate::cli::error::anyhow_to_app_error)?;
285
286    let sheet_obj = sheet_by_index(&workbook, resolved_sheet.index, &resolved_sheet.name)?;
287    let resolved_header =
288        resolve_header_row(&workbook, sheet_obj, resolved_sheet.index, &header_row)?;
289    let sheet_obj = sheet_by_index(&workbook, resolved_sheet.index, &resolved_sheet.name)?;
290
291    let header_names = column_header_names(sheet_obj, resolved_header);
292    let duplicate_flags = duplicate_header_flags(&header_names);
293    let safe_names = stable_safe_names(&header_names);
294    let data_start_row = resolved_header.map_or(1, |row| row.saturating_add(1));
295    let data_row_count = if sheet_obj.max_rows >= data_start_row {
296        sheet_obj.max_rows - data_start_row + 1
297    } else {
298        0
299    };
300
301    let columns: Vec<Value> = (1..=sheet_obj.max_cols)
302        .map(|col| {
303            let stats = analyze_column(sheet_obj, col, data_start_row, data_row_count);
304            json!({
305                "index": col,
306                "name": header_names.get(col - 1).cloned().unwrap_or_default(),
307                "safe_name": safe_names.get(col - 1).cloned().unwrap_or_else(|| {
308                    format!("col_{}", index_to_col_name(col))
309                }),
310                "is_duplicate": duplicate_flags.get(col - 1).copied().unwrap_or(false),
311                "inferred_type": stats.inferred_type,
312                "non_null_ratio": ratio(stats.non_null_count, data_row_count),
313                "formula_ratio": ratio(stats.formula_count, data_row_count),
314                "sample_values": stats.sample_values,
315            })
316        })
317        .collect();
318
319    let mut warnings = Vec::new();
320    if header_row == "auto" && resolved_header.is_none() {
321        warnings.push(json!({
322            "code": "header_not_detected",
323            "message": "No header row was detected; column names are synthetic.",
324        }));
325    }
326
327    Ok(envelope::success_envelope(
328        "inspect.columns",
329        &path_str,
330        &format_str,
331        envelope::target_sheet(&resolved_sheet.name, resolved_sheet.index),
332        json!({
333            "header_row_mode": header_row,
334            "resolved_header_row": resolved_header,
335            "column_count": sheet_obj.max_cols,
336            "data_row_count": data_row_count,
337        }),
338        json!({
339            "columns": columns,
340        }),
341        warnings,
342    ))
343}
344
345fn inspect_tables(file: std::path::PathBuf, sheet: String) -> Result<Value, AppError> {
346    let format_str = file_format(&file);
347    let path_str = file.to_string_lossy().to_string();
348
349    let mut workbook =
350        open_workbook(&file, false).map_err(crate::cli::error::anyhow_to_app_error)?;
351
352    let resolved_sheet = load_target_sheet(&workbook, &Some(sheet), &None)?;
353
354    workbook
355        .ensure_sheet_loaded(resolved_sheet.index, &resolved_sheet.name)
356        .map_err(crate::cli::error::anyhow_to_app_error)?;
357
358    let sheet_obj = sheet_by_index(&workbook, resolved_sheet.index, &resolved_sheet.name)?;
359
360    let candidates = detect_table_candidates(sheet_obj);
361    let candidate_count = candidates.len();
362
363    Ok(envelope::success_envelope(
364        "inspect.tables",
365        &path_str,
366        &format_str,
367        envelope::target_sheet(&resolved_sheet.name, resolved_sheet.index),
368        json!({
369            "candidate_count": candidate_count,
370        }),
371        json!({
372            "candidates": candidates,
373        }),
374        vec![],
375    ))
376}
377
378fn column_header_names(sheet: &Sheet, resolved_header: Option<usize>) -> Vec<String> {
379    (1..=sheet.max_cols)
380        .map(|col| {
381            resolved_header
382                .and_then(|row| cell_at(sheet, row, col))
383                .map(|cell| cell.value.clone())
384                .unwrap_or_default()
385        })
386        .collect()
387}
388
389fn duplicate_header_flags(headers: &[String]) -> Vec<bool> {
390    let mut counts = HashMap::new();
391    for header in headers {
392        let normalized = header.trim();
393        if !normalized.is_empty() {
394            *counts.entry(normalized.to_string()).or_insert(0usize) += 1;
395        }
396    }
397
398    headers
399        .iter()
400        .map(|header| {
401            let normalized = header.trim();
402            !normalized.is_empty() && counts.get(normalized).copied().unwrap_or(0) > 1
403        })
404        .collect()
405}
406
407fn stable_safe_names(headers: &[String]) -> Vec<String> {
408    let mut counts = HashMap::new();
409
410    headers
411        .iter()
412        .enumerate()
413        .map(|(offset, header)| {
414            let col = offset + 1;
415            let base = slugify_header(header, col);
416            let count = counts.entry(base.clone()).or_insert(0usize);
417            *count += 1;
418            if *count == 1 {
419                base
420            } else {
421                format!("{base}_{count}")
422            }
423        })
424        .collect()
425}
426
427fn slugify_header(header: &str, col: usize) -> String {
428    let mut slug = String::new();
429    let mut last_was_separator = false;
430
431    for ch in header.trim().chars() {
432        if ch.is_alphanumeric() {
433            for lower in ch.to_lowercase() {
434                slug.push(lower);
435            }
436            last_was_separator = false;
437        } else if !slug.is_empty() && !last_was_separator {
438            slug.push('_');
439            last_was_separator = true;
440        }
441    }
442
443    while slug.ends_with('_') {
444        slug.pop();
445    }
446
447    if slug.is_empty() {
448        format!("col_{}", index_to_col_name(col))
449    } else {
450        slug
451    }
452}
453
454struct ColumnStats {
455    inferred_type: &'static str,
456    non_null_count: usize,
457    formula_count: usize,
458    sample_values: Vec<Value>,
459}
460
461fn analyze_column(
462    sheet: &Sheet,
463    col: usize,
464    data_start_row: usize,
465    data_row_count: usize,
466) -> ColumnStats {
467    let mut inferred_type = None;
468    let mut is_mixed = false;
469    let mut non_null_count = 0usize;
470    let mut formula_count = 0usize;
471    let mut sample_values = Vec::new();
472
473    if data_row_count > 0 {
474        for row in data_start_row..data_start_row + data_row_count {
475            if let Some(cell) = cell_at(sheet, row, col) {
476                if cell.is_formula || cell.formula.is_some() {
477                    formula_count += 1;
478                }
479
480                if is_non_null(cell) {
481                    non_null_count += 1;
482
483                    if sample_values.len() < 5 {
484                        sample_values.push(crate::json_export::process_cell_value(cell));
485                    }
486
487                    if let Some(cell_type) = inferred_kind(cell) {
488                        match inferred_type {
489                            None => inferred_type = Some(cell_type),
490                            Some(existing) if existing == cell_type => {}
491                            Some(_) => is_mixed = true,
492                        }
493                    }
494                }
495            }
496        }
497    }
498
499    ColumnStats {
500        inferred_type: if is_mixed {
501            "mixed"
502        } else {
503            inferred_type.unwrap_or("string")
504        },
505        non_null_count,
506        formula_count,
507        sample_values,
508    }
509}
510
511fn is_non_null(cell: &Cell) -> bool {
512    !cell.value.is_empty()
513}
514
515fn inferred_kind(cell: &Cell) -> Option<&'static str> {
516    match cell.cell_type {
517        CellType::Text => Some("string"),
518        CellType::Number => Some("number"),
519        CellType::Date => Some("date"),
520        CellType::Boolean => Some("boolean"),
521        CellType::Empty => None,
522    }
523}
524
525fn ratio(numerator: usize, denominator: usize) -> f64 {
526    if denominator == 0 {
527        0.0
528    } else {
529        numerator as f64 / denominator as f64
530    }
531}
532
533#[derive(Clone)]
534struct TableCandidate {
535    start_row: usize,
536    start_col: usize,
537    end_row: usize,
538    end_col: usize,
539    confidence: f64,
540}
541
542fn detect_table_candidates(sheet: &Sheet) -> Vec<Value> {
543    if sheet.max_rows == 0 || sheet.max_cols == 0 {
544        return Vec::new();
545    }
546
547    let mut candidates = Vec::new();
548
549    for row in 1..=sheet.max_rows {
550        let spans = non_empty_spans(sheet, row);
551        for (start_col, end_col) in spans {
552            let column_count = end_col - start_col + 1;
553            if column_count < 2 {
554                continue;
555            }
556
557            if !is_header_like(sheet, row, start_col, end_col) {
558                continue;
559            }
560
561            if row > 1 && previous_row_blocks_header_candidate(sheet, row, start_col, end_col) {
562                continue;
563            }
564
565            let Some(end_row) = extend_candidate(sheet, row, start_col, end_col) else {
566                continue;
567            };
568
569            let confidence = score_candidate(sheet, row, start_col, end_row, end_col);
570            if confidence < 0.5 {
571                continue;
572            }
573
574            candidates.push(TableCandidate {
575                start_row: row,
576                start_col,
577                end_row,
578                end_col,
579                confidence,
580            });
581        }
582    }
583
584    candidates.sort_by_key(|candidate| (candidate.start_row, candidate.start_col));
585    let candidates = remove_duplicate_candidates(candidates);
586
587    candidates
588        .into_iter()
589        .map(|candidate| {
590            let range = format_range(
591                candidate.start_row,
592                candidate.start_col,
593                candidate.end_row,
594                candidate.end_col,
595            );
596            json!({
597                "range": range,
598                "header_row": candidate.start_row,
599                "column_count": candidate.end_col - candidate.start_col + 1,
600                "row_count": candidate.end_row - candidate.start_row + 1,
601                "confidence": candidate.confidence,
602            })
603        })
604        .collect()
605}
606
607fn non_empty_spans(sheet: &Sheet, row: usize) -> Vec<(usize, usize)> {
608    let mut spans = Vec::new();
609    let mut current_start = None;
610
611    for col in 1..=sheet.max_cols {
612        if cell_has_value(sheet, row, col) {
613            current_start.get_or_insert(col);
614        } else if let Some(start) = current_start.take() {
615            spans.push((start, col - 1));
616        }
617    }
618
619    if let Some(start) = current_start {
620        spans.push((start, sheet.max_cols));
621    }
622
623    spans
624}
625
626fn is_header_like(sheet: &Sheet, row: usize, start_col: usize, end_col: usize) -> bool {
627    let width = end_col - start_col + 1;
628    let non_empty = non_empty_count(sheet, row, start_col, end_col);
629    if non_empty < 2 {
630        return false;
631    }
632
633    let fill_ratio = non_empty as f64 / width as f64;
634    let text_ratio = text_count(sheet, row, start_col, end_col) as f64 / non_empty as f64;
635    fill_ratio >= 0.8 && text_ratio >= 0.5
636}
637
638fn extend_candidate(
639    sheet: &Sheet,
640    header_row: usize,
641    start_col: usize,
642    end_col: usize,
643) -> Option<usize> {
644    let mut end_row = header_row;
645    let mut has_data_row = false;
646
647    for row in (header_row + 1)..=sheet.max_rows {
648        if row_has_any_value(sheet, row, start_col, end_col) {
649            end_row = row;
650            has_data_row = true;
651        } else {
652            break;
653        }
654    }
655
656    has_data_row.then_some(end_row)
657}
658
659fn score_candidate(
660    sheet: &Sheet,
661    header_row: usize,
662    start_col: usize,
663    end_row: usize,
664    end_col: usize,
665) -> f64 {
666    let width = end_col - start_col + 1;
667    let header_non_empty = non_empty_count(sheet, header_row, start_col, end_col);
668    let header_fill = header_non_empty as f64 / width as f64;
669    let header_text =
670        text_count(sheet, header_row, start_col, end_col) as f64 / header_non_empty.max(1) as f64;
671
672    let data_rows = end_row.saturating_sub(header_row);
673    let data_fill = if data_rows == 0 {
674        0.0
675    } else {
676        let filled_cells: usize = ((header_row + 1)..=end_row)
677            .map(|row| non_empty_count(sheet, row, start_col, end_col))
678            .sum();
679        filled_cells as f64 / (data_rows * width) as f64
680    };
681
682    let height_score = (data_rows as f64 / 3.0).min(1.0);
683    let before_boundary =
684        header_row == 1 || !row_has_any_value(sheet, header_row - 1, start_col, end_col);
685    let after_boundary =
686        end_row == sheet.max_rows || !row_has_any_value(sheet, end_row + 1, start_col, end_col);
687    let boundary_score = match (before_boundary, after_boundary) {
688        (true, true) => 1.0,
689        (true, false) | (false, true) => 0.5,
690        (false, false) => 0.0,
691    };
692
693    let confidence = header_fill * 0.25
694        + header_text * 0.25
695        + data_fill * 0.25
696        + height_score * 0.15
697        + boundary_score * 0.10;
698
699    (confidence.clamp(0.0, 1.0) * 100.0).round() / 100.0
700}
701
702fn remove_duplicate_candidates(candidates: Vec<TableCandidate>) -> Vec<TableCandidate> {
703    let mut kept: Vec<TableCandidate> = Vec::new();
704
705    'candidate: for candidate in candidates {
706        for existing in &kept {
707            if same_table_region(existing, &candidate) {
708                continue 'candidate;
709            }
710        }
711        kept.push(candidate);
712    }
713
714    kept
715}
716
717fn same_table_region(left: &TableCandidate, right: &TableCandidate) -> bool {
718    left.start_row == right.start_row
719        && left.start_col == right.start_col
720        && left.end_row == right.end_row
721        && left.end_col == right.end_col
722}
723
724fn previous_row_blocks_header_candidate(
725    sheet: &Sheet,
726    row: usize,
727    start_col: usize,
728    end_col: usize,
729) -> bool {
730    let width = end_col - start_col + 1;
731    let previous_non_empty = non_empty_count(sheet, row - 1, start_col, end_col);
732    previous_non_empty >= 2 && previous_non_empty as f64 / width as f64 >= 0.5
733}
734
735fn row_has_any_value(sheet: &Sheet, row: usize, start_col: usize, end_col: usize) -> bool {
736    (start_col..=end_col).any(|col| cell_has_value(sheet, row, col))
737}
738
739fn non_empty_count(sheet: &Sheet, row: usize, start_col: usize, end_col: usize) -> usize {
740    (start_col..=end_col)
741        .filter(|&col| cell_has_value(sheet, row, col))
742        .count()
743}
744
745fn text_count(sheet: &Sheet, row: usize, start_col: usize, end_col: usize) -> usize {
746    (start_col..=end_col)
747        .filter(|&col| {
748            cell_has_value(sheet, row, col)
749                && matches!(table_cell_type(sheet, row, col), Some(CellType::Text))
750        })
751        .count()
752}
753
754fn cell_has_value(sheet: &Sheet, row: usize, col: usize) -> bool {
755    sheet
756        .data
757        .get(row)
758        .and_then(|row_data| row_data.get(col))
759        .map(|cell| !cell.value.trim().is_empty() || cell.formula.is_some())
760        .unwrap_or(false)
761}
762
763fn table_cell_type(sheet: &Sheet, row: usize, col: usize) -> Option<CellType> {
764    sheet
765        .data
766        .get(row)
767        .and_then(|row_data| row_data.get(col))
768        .map(|cell| cell.cell_type.clone())
769}