// dataprof_core/partial.rs
use crate::classification::DataType;

3/// Result of fast schema inference — column names paired with inferred data types.
4#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
5pub struct SchemaResult {
6 /// Columns with their inferred types. For CSV/Parquet the order matches
7 /// the source; for JSON/JSONL columns are sorted alphabetically.
8 pub columns: Vec<ColumnSchema>,
9 /// How many rows were sampled to infer the schema (0 for Parquet metadata).
10 pub rows_sampled: usize,
11 /// Time taken for inference in milliseconds.
12 pub inference_time_ms: u128,
13 /// `true` when the entire file was consumed or schema was read from
14 /// metadata; `false` when inference stopped at the sample-size cap and
15 /// the schema may not have fully stabilized.
16 pub schema_stable: bool,
17}
18
19/// A single column's name and inferred data type.
20#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
21pub struct ColumnSchema {
22 pub name: String,
23 pub data_type: DataType,
24}
25
26/// Result of a quick row count operation.
27#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
28pub struct RowCountEstimate {
29 /// The estimated or exact row count.
30 pub count: u64,
31 /// Whether the count is exact or an estimate.
32 pub exact: bool,
33 /// How the count was obtained.
34 pub method: CountMethod,
35 /// Time taken in milliseconds.
36 pub count_time_ms: u128,
37}
38
39/// Method used to obtain the row count.
40#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
41pub enum CountMethod {
42 /// Read from Parquet file footer metadata (exact, zero row reading).
43 ParquetMetadata,
44 /// Full scan of the file (exact).
45 FullScan,
46 /// Sample-based estimation (approximate).
47 Sampling,
48 /// Full scan of a streaming source (no file metadata available).
49 StreamFullScan,
50}