Skip to main content

memvid_cli/commands/
tables.rs

1//! Table extraction and management commands.
2//!
3//! Provides CLI commands for:
//! - Importing tables from documents (PDF, DOCX, XLSX)
5//! - Listing tables stored in an MV2 file
6//! - Exporting tables to CSV/JSON
7//! - Viewing individual tables
8
9use std::fs;
10use std::path::PathBuf;
11
12use anyhow::{bail, Result};
13use clap::{Args, Subcommand, ValueEnum};
14use memvid_core::table::{
15    export_to_csv, export_to_json, extract_tables, get_table, list_tables, store_table,
16    ExtractionMode, TableExtractionOptions, TableQuality,
17};
18use memvid_core::Memvid;
19use serde_json::json;
20
21use crate::config::CliConfig;
22
/// Arguments for the `tables` command
// Top-level container for `tables`: it declares no options of its own and
// only carries the selected subcommand, which `handle_tables` matches on.
#[derive(Args)]
pub struct TablesArgs {
    // The chosen subcommand together with its own parsed arguments.
    #[command(subcommand)]
    pub command: TablesCommand,
}
29
/// Tables subcommands
// One variant per subcommand; each wraps the argument struct that the
// matching `handle_tables_*` function consumes.
// NOTE(review): clap's derive API is expected to reuse the `///` line on each
// variant as that subcommand's `--help` description, so treat those lines as
// user-facing text when editing them.
#[derive(Subcommand)]
pub enum TablesCommand {
    /// Import tables from a document (PDF, DOCX, XLSX)
    Import(TablesImportArgs),
    /// List all tables stored in the memory
    List(TablesListArgs),
    /// Export a table to CSV or JSON
    Export(TablesExportArgs),
    /// View a specific table
    View(TablesViewArgs),
}
42
43/// Arguments for `tables import`
44#[derive(Args)]
45pub struct TablesImportArgs {
46    /// MV2 file to store tables in
47    #[arg(value_name = "FILE", value_parser = clap::value_parser!(PathBuf))]
48    pub file: PathBuf,
49
50    /// Input document (PDF, DOCX, XLSX)
51    #[arg(long = "input", short = 'i', value_name = "PATH", value_parser = clap::value_parser!(PathBuf))]
52    pub input: PathBuf,
53
54    /// Extraction mode
55    #[arg(long = "mode", value_enum, default_value = "conservative")]
56    pub mode: ExtractionModeArg,
57
58    /// Minimum number of rows required
59    #[arg(long = "min-rows", default_value = "2")]
60    pub min_rows: usize,
61
62    /// Minimum number of columns required
63    #[arg(long = "min-cols", default_value = "2")]
64    pub min_cols: usize,
65
66    /// Minimum quality threshold
67    #[arg(long = "min-quality", value_enum, default_value = "medium")]
68    pub min_quality: QualityArg,
69
70    /// Enable multi-page table merging
71    #[arg(long = "merge-multi-page", default_value = "true")]
72    pub merge_multi_page: bool,
73
74    /// Maximum pages to process (0 = all)
75    #[arg(long = "max-pages", default_value = "0")]
76    pub max_pages: usize,
77
78    /// Embed table rows for search indexing
79    #[arg(long = "embed-rows", default_value = "true")]
80    pub embed_rows: bool,
81
82    /// Output as JSON
83    #[arg(long)]
84    pub json: bool,
85}
86
/// Arguments for `tables list`
// Consumed by `handle_tables_list`, which opens `file` and prints every
// table it reports.
#[derive(Args)]
pub struct TablesListArgs {
    /// MV2 file to list tables from
    #[arg(value_name = "FILE", value_parser = clap::value_parser!(PathBuf))]
    pub file: PathBuf,

    /// Output as JSON
    // When set, the handler prints a JSON object with `count` and `tables`
    // instead of the plain-text listing.
    #[arg(long)]
    pub json: bool,
}
98
/// Arguments for `tables export`
// Consumed by `handle_tables_export`.
#[derive(Args)]
pub struct TablesExportArgs {
    /// MV2 file containing the table
    #[arg(value_name = "FILE", value_parser = clap::value_parser!(PathBuf))]
    pub file: PathBuf,

    /// Table ID to export
    // Declared as a non-optional `String`; the handler fails with
    // "Table '<id>' not found" when the lookup returns nothing.
    #[arg(long = "table-id", value_name = "ID")]
    pub table_id: String,

    /// Output file path
    // Optional: when absent the handler prints the exported text to stdout.
    #[arg(long = "out", short = 'o', value_name = "PATH", value_parser = clap::value_parser!(PathBuf))]
    pub out: Option<PathBuf>,

    /// Output format
    #[arg(long = "format", value_enum, default_value = "csv")]
    pub format: ExportFormatArg,

    /// For JSON: output as array of records instead of table object
    // Forwarded to `export_to_json`; the CSV path never reads it.
    #[arg(long = "as-records")]
    pub as_records: bool,
}
122
/// Arguments for `tables view`
// Consumed by `handle_tables_view`.
#[derive(Args)]
pub struct TablesViewArgs {
    /// MV2 file containing the table
    #[arg(value_name = "FILE", value_parser = clap::value_parser!(PathBuf))]
    pub file: PathBuf,

    /// Table ID to view
    #[arg(long = "table-id", value_name = "ID")]
    pub table_id: String,

    /// Output as JSON
    #[arg(long)]
    pub json: bool,

    /// Maximum rows to display (0 = all)
    // The handler maps 0 to `usize::MAX`. In JSON mode the cap applies to all
    // rows; in text mode it applies after header rows have been filtered out.
    #[arg(long = "limit", default_value = "50")]
    pub limit: usize,
}
142
/// Extraction mode CLI argument
// CLI-facing mirror of `memvid_core::table::ExtractionMode`; the `From` impl
// in this file maps each variant to the core variant of the same name.
// NOTE(review): with `ValueEnum`, the per-variant `///` lines are presumably
// shown as possible-value descriptions in `--help` — keep them user-facing.
#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
pub enum ExtractionModeArg {
    /// Only detect tables with visible grid lines
    LatticeOnly,
    /// Only detect tables from text alignment
    StreamOnly,
    /// Try lattice first, fall back to stream
    Conservative,
    /// Aggressive detection (both methods, more false positives)
    Aggressive,
}
155
156impl From<ExtractionModeArg> for ExtractionMode {
157    fn from(value: ExtractionModeArg) -> Self {
158        match value {
159            ExtractionModeArg::LatticeOnly => ExtractionMode::LatticeOnly,
160            ExtractionModeArg::StreamOnly => ExtractionMode::StreamOnly,
161            ExtractionModeArg::Conservative => ExtractionMode::Conservative,
162            ExtractionModeArg::Aggressive => ExtractionMode::Aggressive,
163        }
164    }
165}
166
/// Quality threshold CLI argument
// CLI-facing mirror of `memvid_core::table::TableQuality`; the `From` impl in
// this file maps each variant to the core variant of the same name.
// Variants deliberately carry `//` comments only: adding `///` here would
// presumably add descriptions to the generated `--help` output.
#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
pub enum QualityArg {
    High,
    Medium,
    Low,
}
174
175impl From<QualityArg> for TableQuality {
176    fn from(value: QualityArg) -> Self {
177        match value {
178            QualityArg::High => TableQuality::High,
179            QualityArg::Medium => TableQuality::Medium,
180            QualityArg::Low => TableQuality::Low,
181        }
182    }
183}
184
/// Export format CLI argument
// Selects which exporter `handle_tables_export` calls: `export_to_csv` for
// `Csv`, `export_to_json` for `Json`.
#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
pub enum ExportFormatArg {
    Csv,
    Json,
}
191
192// ============================================================================
193// Command handlers
194// ============================================================================
195
196/// Main dispatcher for tables commands
197pub fn handle_tables(_config: &CliConfig, args: TablesArgs) -> Result<()> {
198    match args.command {
199        TablesCommand::Import(import_args) => handle_tables_import(import_args),
200        TablesCommand::List(list_args) => handle_tables_list(list_args),
201        TablesCommand::Export(export_args) => handle_tables_export(export_args),
202        TablesCommand::View(view_args) => handle_tables_view(view_args),
203    }
204}
205
206/// Handle `tables import` command
207fn handle_tables_import(args: TablesImportArgs) -> Result<()> {
208    // Read input document
209    let input_bytes = fs::read(&args.input)?;
210    let filename = args
211        .input
212        .file_name()
213        .and_then(|s| s.to_str())
214        .unwrap_or("unknown");
215
216    // Build extraction options
217    let options = TableExtractionOptions::builder()
218        .mode(args.mode.into())
219        .min_rows(args.min_rows)
220        .min_cols(args.min_cols)
221        .min_quality(args.min_quality.into())
222        .merge_multi_page(args.merge_multi_page)
223        .max_pages(args.max_pages)
224        .build();
225
226    // Extract tables
227    let result = extract_tables(&input_bytes, filename, &options)?;
228
229    if result.tables.is_empty() {
230        if args.json {
231            println!(
232                "{}",
233                serde_json::to_string_pretty(&json!({
234                    "tables_found": 0,
235                    "tables_stored": 0,
236                    "warnings": result.warnings,
237                }))?
238            );
239        } else {
240            println!("No tables found in {}", filename);
241            if !result.warnings.is_empty() {
242                println!("\nWarnings:");
243                for warning in &result.warnings {
244                    println!("  - {}", warning);
245                }
246            }
247        }
248        return Ok(());
249    }
250
251    // Open MV2 file and store tables
252    let mut mem = Memvid::open(&args.file)?;
253    let mut stored_tables = Vec::new();
254
255    for table in &result.tables {
256        let (meta_id, row_ids) = store_table(&mut mem, table, args.embed_rows)?;
257        stored_tables.push(json!({
258            "table_id": table.table_id,
259            "meta_frame_id": meta_id,
260            "row_frame_ids": row_ids,
261            "rows": table.n_rows,
262            "cols": table.n_cols,
263            "quality": format!("{:?}", table.quality),
264            "detection_mode": format!("{:?}", table.detection_mode),
265            "pages": format!("{}-{}", table.page_start, table.page_end),
266        }));
267    }
268
269    if args.json {
270        println!(
271            "{}",
272            serde_json::to_string_pretty(&json!({
273                "tables_found": result.tables.len(),
274                "tables_stored": stored_tables.len(),
275                "extraction_ms": result.total_ms,
276                "tables": stored_tables,
277                "warnings": result.warnings,
278            }))?
279        );
280    } else {
281        println!(
282            "Extracted {} tables from {} in {} ms",
283            result.tables.len(),
284            filename,
285            result.total_ms
286        );
287        println!();
288        for (i, table) in result.tables.iter().enumerate() {
289            println!(
290                "Table {}: {} rows × {} cols ({:?}, {:?})",
291                i + 1,
292                table.n_rows,
293                table.n_cols,
294                table.quality,
295                table.detection_mode
296            );
297            println!(
298                "  Pages: {}-{}, Confidence: {:.2}",
299                table.page_start, table.page_end, table.confidence_score
300            );
301            if !table.headers.is_empty() {
302                let header_preview: Vec<_> = table
303                    .headers
304                    .iter()
305                    .take(5)
306                    .map(|s| truncate_string(s, 20))
307                    .collect();
308                let suffix = if table.headers.len() > 5 {
309                    format!(" ... ({} more)", table.headers.len() - 5)
310                } else {
311                    String::new()
312                };
313                println!("  Headers: [{}]{}", header_preview.join(", "), suffix);
314            }
315        }
316        if !result.warnings.is_empty() {
317            println!("\nWarnings:");
318            for warning in &result.warnings {
319                println!("  - {}", warning);
320            }
321        }
322    }
323
324    Ok(())
325}
326
327/// Handle `tables list` command
328fn handle_tables_list(args: TablesListArgs) -> Result<()> {
329    let mut mem = Memvid::open(&args.file)?;
330    let tables = list_tables(&mut mem)?;
331
332    if args.json {
333        let json_tables: Vec<_> = tables
334            .iter()
335            .map(|t| {
336                json!({
337                    "table_id": t.table_id,
338                    "frame_id": t.frame_id,
339                    "source_file": t.source_file,
340                    "n_rows": t.n_rows,
341                    "n_cols": t.n_cols,
342                    "pages": format!("{}-{}", t.page_start, t.page_end),
343                    "quality": format!("{:?}", t.quality),
344                    "headers": t.headers,
345                })
346            })
347            .collect();
348        println!(
349            "{}",
350            serde_json::to_string_pretty(&json!({
351                "count": tables.len(),
352                "tables": json_tables,
353            }))?
354        );
355    } else if tables.is_empty() {
356        println!("No tables stored in this memory.");
357    } else {
358        println!("Tables in memory ({}):", tables.len());
359        println!();
360        for table in &tables {
361            println!(
362                "  {} — {} rows × {} cols",
363                table.table_id, table.n_rows, table.n_cols
364            );
365            println!(
366                "    Source: {}, Pages: {}-{}, Quality: {:?}",
367                table.source_file, table.page_start, table.page_end, table.quality
368            );
369            if !table.headers.is_empty() {
370                let header_preview: Vec<_> = table
371                    .headers
372                    .iter()
373                    .take(4)
374                    .map(|s| truncate_string(s, 15))
375                    .collect();
376                let suffix = if table.headers.len() > 4 {
377                    format!(" ... (+{})", table.headers.len() - 4)
378                } else {
379                    String::new()
380                };
381                println!("    Headers: [{}]{}", header_preview.join(", "), suffix);
382            }
383            println!();
384        }
385    }
386
387    Ok(())
388}
389
390/// Handle `tables export` command
391fn handle_tables_export(args: TablesExportArgs) -> Result<()> {
392    let mut mem = Memvid::open(&args.file)?;
393
394    let table = get_table(&mut mem, &args.table_id)?;
395    let table = match table {
396        Some(t) => t,
397        None => bail!("Table '{}' not found", args.table_id),
398    };
399
400    let output = match args.format {
401        ExportFormatArg::Csv => export_to_csv(&table),
402        ExportFormatArg::Json => export_to_json(&table, args.as_records)?,
403    };
404
405    if let Some(out_path) = args.out {
406        fs::write(&out_path, &output)?;
407        println!(
408            "Exported table '{}' to {}",
409            args.table_id,
410            out_path.display()
411        );
412    } else {
413        println!("{}", output);
414    }
415
416    Ok(())
417}
418
419/// Handle `tables view` command
420fn handle_tables_view(args: TablesViewArgs) -> Result<()> {
421    let mut mem = Memvid::open(&args.file)?;
422
423    let table = get_table(&mut mem, &args.table_id)?;
424    let table = match table {
425        Some(t) => t,
426        None => bail!("Table '{}' not found", args.table_id),
427    };
428
429    if args.json {
430        println!(
431            "{}",
432            serde_json::to_string_pretty(&json!({
433                "table_id": table.table_id,
434                "source_file": table.source_file,
435                "page_start": table.page_start,
436                "page_end": table.page_end,
437                "n_rows": table.n_rows,
438                "n_cols": table.n_cols,
439                "quality": format!("{:?}", table.quality),
440                "detection_mode": format!("{:?}", table.detection_mode),
441                "confidence_score": table.confidence_score,
442                "headers": table.headers,
443                "rows": table.rows.iter().take(if args.limit == 0 { usize::MAX } else { args.limit }).map(|r| {
444                    json!({
445                        "row_index": r.row_index,
446                        "page": r.page,
447                        "is_header": r.is_header_row,
448                        "cells": r.cells.iter().map(|c| {
449                            json!({
450                                "text": c.text,
451                                "col_index": c.col_index,
452                                "is_header": c.is_header,
453                                "col_span": c.col_span,
454                                "row_span": c.row_span,
455                            })
456                        }).collect::<Vec<_>>(),
457                    })
458                }).collect::<Vec<_>>(),
459                "warnings": table.warnings,
460            }))?
461        );
462    } else {
463        println!("Table: {}", table.table_id);
464        println!("Source: {}", table.source_file);
465        println!(
466            "Pages: {}-{}, Quality: {:?}, Mode: {:?}",
467            table.page_start, table.page_end, table.quality, table.detection_mode
468        );
469        println!(
470            "Size: {} rows × {} cols, Confidence: {:.2}",
471            table.n_rows, table.n_cols, table.confidence_score
472        );
473        println!();
474
475        // Calculate column widths
476        let mut col_widths: Vec<usize> = vec![0; table.n_cols];
477        for (i, header) in table.headers.iter().enumerate() {
478            if i < col_widths.len() {
479                col_widths[i] = col_widths[i].max(header.len().min(30));
480            }
481        }
482        for row in &table.rows {
483            for cell in &row.cells {
484                if cell.col_index < col_widths.len() {
485                    col_widths[cell.col_index] =
486                        col_widths[cell.col_index].max(cell.text.len().min(30));
487                }
488            }
489        }
490
491        // Print headers
492        if !table.headers.is_empty() {
493            let header_line: Vec<String> = table
494                .headers
495                .iter()
496                .enumerate()
497                .map(|(i, h)| {
498                    let width = col_widths.get(i).copied().unwrap_or(10);
499                    format!("{:width$}", truncate_string(h, width), width = width)
500                })
501                .collect();
502            println!("| {} |", header_line.join(" | "));
503            let separator: Vec<String> = col_widths.iter().map(|w| "-".repeat(*w)).collect();
504            println!("|-{}-|", separator.join("-|-"));
505        }
506
507        // Print rows
508        let limit = if args.limit == 0 {
509            usize::MAX
510        } else {
511            args.limit
512        };
513        let rows_to_show: Vec<_> = table
514            .rows
515            .iter()
516            .filter(|r| !r.is_header_row)
517            .take(limit)
518            .collect();
519
520        for row in &rows_to_show {
521            let mut cell_texts: Vec<String> = vec![String::new(); table.n_cols];
522            for cell in &row.cells {
523                if cell.col_index < cell_texts.len() {
524                    cell_texts[cell.col_index] = cell.text.clone();
525                }
526            }
527            let row_line: Vec<String> = cell_texts
528                .iter()
529                .enumerate()
530                .map(|(i, text)| {
531                    let width = col_widths.get(i).copied().unwrap_or(10);
532                    format!("{:width$}", truncate_string(text, width), width = width)
533                })
534                .collect();
535            println!("| {} |", row_line.join(" | "));
536        }
537
538        let total_data_rows = table.rows.iter().filter(|r| !r.is_header_row).count();
539        if rows_to_show.len() < total_data_rows {
540            println!(
541                "\n... showing {} of {} rows (use --limit 0 to show all)",
542                rows_to_show.len(),
543                total_data_rows
544            );
545        }
546
547        if !table.warnings.is_empty() {
548            println!("\nWarnings:");
549            for warning in &table.warnings {
550                println!("  - {}", warning);
551            }
552        }
553    }
554
555    Ok(())
556}
557
/// Truncate a string to at most `max_len` characters.
///
/// Length is measured in Unicode scalar values (`char`s), not bytes, so text
/// containing multi-byte characters is only shortened when it really exceeds
/// `max_len` characters.
///
/// - If `s` fits, it is returned unchanged.
/// - If `max_len <= 3` there is no room for an ellipsis, so the first
///   `max_len` characters are returned as-is.
/// - Otherwise the first `max_len - 3` characters are kept and `"..."` is
///   appended, giving a result of exactly `max_len` characters.
fn truncate_string(s: &str, max_len: usize) -> String {
    // Byte length is an upper bound on the character count, so the cheap
    // check settles the common ASCII case without walking the string.
    if s.len() <= max_len || s.chars().count() <= max_len {
        return s.to_string();
    }

    if max_len <= 3 {
        return s.chars().take(max_len).collect();
    }

    let mut shortened: String = s.chars().take(max_len - 3).collect();
    shortened.push_str("...");
    shortened
}