1use std::fs;
10use std::path::PathBuf;
11
12use anyhow::{bail, Result};
13use clap::{Args, Subcommand, ValueEnum};
14use memvid_core::table::{
15 export_to_csv, export_to_json, extract_tables, get_table, list_tables, store_table,
16 ExtractionMode, TableExtractionOptions, TableQuality,
17};
18use memvid_core::Memvid;
19use serde_json::json;
20
21use crate::config::CliConfig;
22
#[derive(Args)]
/// Top-level arguments for the table commands; holds the selected subcommand.
pub struct TablesArgs {
    /// Which table operation to run (see [`TablesCommand`]).
    #[command(subcommand)]
    pub command: TablesCommand,
}
29
#[derive(Subcommand)]
/// Table operations that can be selected on the command line.
pub enum TablesCommand {
    /// Extract tables from an input document and store them in a memory file.
    Import(TablesImportArgs),
    /// List the tables stored in a memory file.
    List(TablesListArgs),
    /// Export one stored table as CSV or JSON.
    Export(TablesExportArgs),
    /// Show one stored table's metadata and rows.
    View(TablesViewArgs),
}
42
43#[derive(Args)]
45pub struct TablesImportArgs {
46 #[arg(value_name = "FILE", value_parser = clap::value_parser!(PathBuf))]
48 pub file: PathBuf,
49
50 #[arg(long = "input", short = 'i', value_name = "PATH", value_parser = clap::value_parser!(PathBuf))]
52 pub input: PathBuf,
53
54 #[arg(long = "mode", value_enum, default_value = "conservative")]
56 pub mode: ExtractionModeArg,
57
58 #[arg(long = "min-rows", default_value = "2")]
60 pub min_rows: usize,
61
62 #[arg(long = "min-cols", default_value = "2")]
64 pub min_cols: usize,
65
66 #[arg(long = "min-quality", value_enum, default_value = "medium")]
68 pub min_quality: QualityArg,
69
70 #[arg(long = "merge-multi-page", default_value = "true")]
72 pub merge_multi_page: bool,
73
74 #[arg(long = "max-pages", default_value = "0")]
76 pub max_pages: usize,
77
78 #[arg(long = "embed-rows", default_value = "true")]
80 pub embed_rows: bool,
81
82 #[arg(long)]
84 pub json: bool,
85}
86
#[derive(Args)]
/// Arguments for listing the tables stored in a memory file.
pub struct TablesListArgs {
    /// Memory file to open.
    #[arg(value_name = "FILE", value_parser = clap::value_parser!(PathBuf))]
    pub file: PathBuf,

    /// Print the listing as JSON instead of plain text.
    #[arg(long)]
    pub json: bool,
}
98
#[derive(Args)]
/// Arguments for exporting a single stored table.
pub struct TablesExportArgs {
    /// Memory file to open.
    #[arg(value_name = "FILE", value_parser = clap::value_parser!(PathBuf))]
    pub file: PathBuf,

    /// Identifier of the table to export.
    #[arg(long = "table-id", value_name = "ID")]
    pub table_id: String,

    /// File to write the export to; when omitted the export is printed to stdout.
    #[arg(long = "out", short = 'o', value_name = "PATH", value_parser = clap::value_parser!(PathBuf))]
    pub out: Option<PathBuf>,

    /// Output format (CSV by default).
    #[arg(long = "format", value_enum, default_value = "csv")]
    pub format: ExportFormatArg,

    /// Forwarded to the JSON exporter as its second argument; not used for CSV output.
    #[arg(long = "as-records")]
    pub as_records: bool,
}
122
#[derive(Args)]
/// Arguments for viewing a single stored table.
pub struct TablesViewArgs {
    /// Memory file to open.
    #[arg(value_name = "FILE", value_parser = clap::value_parser!(PathBuf))]
    pub file: PathBuf,

    /// Identifier of the table to show.
    #[arg(long = "table-id", value_name = "ID")]
    pub table_id: String,

    /// Print the table as JSON instead of a text grid.
    #[arg(long)]
    pub json: bool,

    /// Maximum number of rows to show; 0 shows all rows.
    #[arg(long = "limit", default_value = "50")]
    pub limit: usize,
}
142
#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
/// Command-line spelling of the table extraction mode.
///
/// Each variant converts to the same-named `ExtractionMode` variant.
pub enum ExtractionModeArg {
    /// Selects `ExtractionMode::LatticeOnly`.
    LatticeOnly,
    /// Selects `ExtractionMode::StreamOnly`.
    StreamOnly,
    /// Selects `ExtractionMode::Conservative` (the `--mode` default for import).
    Conservative,
    /// Selects `ExtractionMode::Aggressive`.
    Aggressive,
}
155
156impl From<ExtractionModeArg> for ExtractionMode {
157 fn from(value: ExtractionModeArg) -> Self {
158 match value {
159 ExtractionModeArg::LatticeOnly => ExtractionMode::LatticeOnly,
160 ExtractionModeArg::StreamOnly => ExtractionMode::StreamOnly,
161 ExtractionModeArg::Conservative => ExtractionMode::Conservative,
162 ExtractionModeArg::Aggressive => ExtractionMode::Aggressive,
163 }
164 }
165}
166
#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
/// Command-line spelling of a table quality level.
///
/// Each variant converts to the same-named `TableQuality` variant.
pub enum QualityArg {
    /// Selects `TableQuality::High`.
    High,
    /// Selects `TableQuality::Medium` (the `--min-quality` default for import).
    Medium,
    /// Selects `TableQuality::Low`.
    Low,
}
174
175impl From<QualityArg> for TableQuality {
176 fn from(value: QualityArg) -> Self {
177 match value {
178 QualityArg::High => TableQuality::High,
179 QualityArg::Medium => TableQuality::Medium,
180 QualityArg::Low => TableQuality::Low,
181 }
182 }
183}
184
#[derive(Copy, Clone, Debug, Eq, PartialEq, ValueEnum)]
/// Output formats supported by the export command.
pub enum ExportFormatArg {
    /// Export through `export_to_csv`.
    Csv,
    /// Export through `export_to_json`.
    Json,
}
191
192pub fn handle_tables(_config: &CliConfig, args: TablesArgs) -> Result<()> {
198 match args.command {
199 TablesCommand::Import(import_args) => handle_tables_import(import_args),
200 TablesCommand::List(list_args) => handle_tables_list(list_args),
201 TablesCommand::Export(export_args) => handle_tables_export(export_args),
202 TablesCommand::View(view_args) => handle_tables_view(view_args),
203 }
204}
205
206fn handle_tables_import(args: TablesImportArgs) -> Result<()> {
208 let input_bytes = fs::read(&args.input)?;
210 let filename = args
211 .input
212 .file_name()
213 .and_then(|s| s.to_str())
214 .unwrap_or("unknown");
215
216 let options = TableExtractionOptions::builder()
218 .mode(args.mode.into())
219 .min_rows(args.min_rows)
220 .min_cols(args.min_cols)
221 .min_quality(args.min_quality.into())
222 .merge_multi_page(args.merge_multi_page)
223 .max_pages(args.max_pages)
224 .build();
225
226 let result = extract_tables(&input_bytes, filename, &options)?;
228
229 if result.tables.is_empty() {
230 if args.json {
231 println!(
232 "{}",
233 serde_json::to_string_pretty(&json!({
234 "tables_found": 0,
235 "tables_stored": 0,
236 "warnings": result.warnings,
237 }))?
238 );
239 } else {
240 println!("No tables found in {}", filename);
241 if !result.warnings.is_empty() {
242 println!("\nWarnings:");
243 for warning in &result.warnings {
244 println!(" - {}", warning);
245 }
246 }
247 }
248 return Ok(());
249 }
250
251 let mut mem = Memvid::open(&args.file)?;
253 let mut stored_tables = Vec::new();
254
255 for table in &result.tables {
256 let (meta_id, row_ids) = store_table(&mut mem, table, args.embed_rows)?;
257 stored_tables.push(json!({
258 "table_id": table.table_id,
259 "meta_frame_id": meta_id,
260 "row_frame_ids": row_ids,
261 "rows": table.n_rows,
262 "cols": table.n_cols,
263 "quality": format!("{:?}", table.quality),
264 "detection_mode": format!("{:?}", table.detection_mode),
265 "pages": format!("{}-{}", table.page_start, table.page_end),
266 }));
267 }
268
269 if args.json {
270 println!(
271 "{}",
272 serde_json::to_string_pretty(&json!({
273 "tables_found": result.tables.len(),
274 "tables_stored": stored_tables.len(),
275 "extraction_ms": result.total_ms,
276 "tables": stored_tables,
277 "warnings": result.warnings,
278 }))?
279 );
280 } else {
281 println!(
282 "Extracted {} tables from {} in {} ms",
283 result.tables.len(),
284 filename,
285 result.total_ms
286 );
287 println!();
288 for (i, table) in result.tables.iter().enumerate() {
289 println!(
290 "Table {}: {} rows × {} cols ({:?}, {:?})",
291 i + 1,
292 table.n_rows,
293 table.n_cols,
294 table.quality,
295 table.detection_mode
296 );
297 println!(
298 " Pages: {}-{}, Confidence: {:.2}",
299 table.page_start, table.page_end, table.confidence_score
300 );
301 if !table.headers.is_empty() {
302 let header_preview: Vec<_> = table
303 .headers
304 .iter()
305 .take(5)
306 .map(|s| truncate_string(s, 20))
307 .collect();
308 let suffix = if table.headers.len() > 5 {
309 format!(" ... ({} more)", table.headers.len() - 5)
310 } else {
311 String::new()
312 };
313 println!(" Headers: [{}]{}", header_preview.join(", "), suffix);
314 }
315 }
316 if !result.warnings.is_empty() {
317 println!("\nWarnings:");
318 for warning in &result.warnings {
319 println!(" - {}", warning);
320 }
321 }
322 }
323
324 Ok(())
325}
326
327fn handle_tables_list(args: TablesListArgs) -> Result<()> {
329 let mut mem = Memvid::open(&args.file)?;
330 let tables = list_tables(&mut mem)?;
331
332 if args.json {
333 let json_tables: Vec<_> = tables
334 .iter()
335 .map(|t| {
336 json!({
337 "table_id": t.table_id,
338 "frame_id": t.frame_id,
339 "source_file": t.source_file,
340 "n_rows": t.n_rows,
341 "n_cols": t.n_cols,
342 "pages": format!("{}-{}", t.page_start, t.page_end),
343 "quality": format!("{:?}", t.quality),
344 "headers": t.headers,
345 })
346 })
347 .collect();
348 println!(
349 "{}",
350 serde_json::to_string_pretty(&json!({
351 "count": tables.len(),
352 "tables": json_tables,
353 }))?
354 );
355 } else if tables.is_empty() {
356 println!("No tables stored in this memory.");
357 } else {
358 println!("Tables in memory ({}):", tables.len());
359 println!();
360 for table in &tables {
361 println!(
362 " {} — {} rows × {} cols",
363 table.table_id, table.n_rows, table.n_cols
364 );
365 println!(
366 " Source: {}, Pages: {}-{}, Quality: {:?}",
367 table.source_file, table.page_start, table.page_end, table.quality
368 );
369 if !table.headers.is_empty() {
370 let header_preview: Vec<_> = table
371 .headers
372 .iter()
373 .take(4)
374 .map(|s| truncate_string(s, 15))
375 .collect();
376 let suffix = if table.headers.len() > 4 {
377 format!(" ... (+{})", table.headers.len() - 4)
378 } else {
379 String::new()
380 };
381 println!(" Headers: [{}]{}", header_preview.join(", "), suffix);
382 }
383 println!();
384 }
385 }
386
387 Ok(())
388}
389
390fn handle_tables_export(args: TablesExportArgs) -> Result<()> {
392 let mut mem = Memvid::open(&args.file)?;
393
394 let table = get_table(&mut mem, &args.table_id)?;
395 let table = match table {
396 Some(t) => t,
397 None => bail!("Table '{}' not found", args.table_id),
398 };
399
400 let output = match args.format {
401 ExportFormatArg::Csv => export_to_csv(&table),
402 ExportFormatArg::Json => export_to_json(&table, args.as_records)?,
403 };
404
405 if let Some(out_path) = args.out {
406 fs::write(&out_path, &output)?;
407 println!(
408 "Exported table '{}' to {}",
409 args.table_id,
410 out_path.display()
411 );
412 } else {
413 println!("{}", output);
414 }
415
416 Ok(())
417}
418
419fn handle_tables_view(args: TablesViewArgs) -> Result<()> {
421 let mut mem = Memvid::open(&args.file)?;
422
423 let table = get_table(&mut mem, &args.table_id)?;
424 let table = match table {
425 Some(t) => t,
426 None => bail!("Table '{}' not found", args.table_id),
427 };
428
429 if args.json {
430 println!(
431 "{}",
432 serde_json::to_string_pretty(&json!({
433 "table_id": table.table_id,
434 "source_file": table.source_file,
435 "page_start": table.page_start,
436 "page_end": table.page_end,
437 "n_rows": table.n_rows,
438 "n_cols": table.n_cols,
439 "quality": format!("{:?}", table.quality),
440 "detection_mode": format!("{:?}", table.detection_mode),
441 "confidence_score": table.confidence_score,
442 "headers": table.headers,
443 "rows": table.rows.iter().take(if args.limit == 0 { usize::MAX } else { args.limit }).map(|r| {
444 json!({
445 "row_index": r.row_index,
446 "page": r.page,
447 "is_header": r.is_header_row,
448 "cells": r.cells.iter().map(|c| {
449 json!({
450 "text": c.text,
451 "col_index": c.col_index,
452 "is_header": c.is_header,
453 "col_span": c.col_span,
454 "row_span": c.row_span,
455 })
456 }).collect::<Vec<_>>(),
457 })
458 }).collect::<Vec<_>>(),
459 "warnings": table.warnings,
460 }))?
461 );
462 } else {
463 println!("Table: {}", table.table_id);
464 println!("Source: {}", table.source_file);
465 println!(
466 "Pages: {}-{}, Quality: {:?}, Mode: {:?}",
467 table.page_start, table.page_end, table.quality, table.detection_mode
468 );
469 println!(
470 "Size: {} rows × {} cols, Confidence: {:.2}",
471 table.n_rows, table.n_cols, table.confidence_score
472 );
473 println!();
474
475 let mut col_widths: Vec<usize> = vec![0; table.n_cols];
477 for (i, header) in table.headers.iter().enumerate() {
478 if i < col_widths.len() {
479 col_widths[i] = col_widths[i].max(header.len().min(30));
480 }
481 }
482 for row in &table.rows {
483 for cell in &row.cells {
484 if cell.col_index < col_widths.len() {
485 col_widths[cell.col_index] =
486 col_widths[cell.col_index].max(cell.text.len().min(30));
487 }
488 }
489 }
490
491 if !table.headers.is_empty() {
493 let header_line: Vec<String> = table
494 .headers
495 .iter()
496 .enumerate()
497 .map(|(i, h)| {
498 let width = col_widths.get(i).copied().unwrap_or(10);
499 format!("{:width$}", truncate_string(h, width), width = width)
500 })
501 .collect();
502 println!("| {} |", header_line.join(" | "));
503 let separator: Vec<String> = col_widths.iter().map(|w| "-".repeat(*w)).collect();
504 println!("|-{}-|", separator.join("-|-"));
505 }
506
507 let limit = if args.limit == 0 {
509 usize::MAX
510 } else {
511 args.limit
512 };
513 let rows_to_show: Vec<_> = table
514 .rows
515 .iter()
516 .filter(|r| !r.is_header_row)
517 .take(limit)
518 .collect();
519
520 for row in &rows_to_show {
521 let mut cell_texts: Vec<String> = vec![String::new(); table.n_cols];
522 for cell in &row.cells {
523 if cell.col_index < cell_texts.len() {
524 cell_texts[cell.col_index] = cell.text.clone();
525 }
526 }
527 let row_line: Vec<String> = cell_texts
528 .iter()
529 .enumerate()
530 .map(|(i, text)| {
531 let width = col_widths.get(i).copied().unwrap_or(10);
532 format!("{:width$}", truncate_string(text, width), width = width)
533 })
534 .collect();
535 println!("| {} |", row_line.join(" | "));
536 }
537
538 let total_data_rows = table.rows.iter().filter(|r| !r.is_header_row).count();
539 if rows_to_show.len() < total_data_rows {
540 println!(
541 "\n... showing {} of {} rows (use --limit 0 to show all)",
542 rows_to_show.len(),
543 total_data_rows
544 );
545 }
546
547 if !table.warnings.is_empty() {
548 println!("\nWarnings:");
549 for warning in &table.warnings {
550 println!(" - {}", warning);
551 }
552 }
553 }
554
555 Ok(())
556}
557
/// Shortens `s` to at most `max_len` characters for display.
///
/// A string that already fits is returned unchanged. A longer string is cut; when
/// `max_len` is greater than 3 the last three characters of the budget are spent on a
/// `...` marker, and for `max_len <= 3` the string is cut with no marker.
///
/// Length is measured in `char`s throughout. Comparing the byte length against
/// `max_len` (as the previous version did) shortened multi-byte text that already
/// fit and appended a spurious `...`.
fn truncate_string(s: &str, max_len: usize) -> String {
    // Byte length is an upper bound on the char count, so the cheap check settles the
    // common ASCII case; otherwise look for a `(max_len + 1)`-th char without
    // walking the whole string.
    let fits = s.len() <= max_len || s.chars().nth(max_len).is_none();
    if fits {
        return s.to_string();
    }

    if max_len <= 3 {
        // No room for an ellipsis: hard cut.
        return s.chars().take(max_len).collect();
    }

    let mut shortened: String = s.chars().take(max_len - 3).collect();
    shortened.push_str("...");
    shortened
}