Skip to main content

dataprof_core/
partial.rs

1use crate::classification::DataType;
2
3/// Result of fast schema inference — column names paired with inferred data types.
4#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
5pub struct SchemaResult {
6    /// Columns with their inferred types. For CSV/Parquet the order matches
7    /// the source; for JSON/JSONL columns are sorted alphabetically.
8    pub columns: Vec<ColumnSchema>,
9    /// How many rows were sampled to infer the schema (0 for Parquet metadata).
10    pub rows_sampled: usize,
11    /// Time taken for inference in milliseconds.
12    pub inference_time_ms: u128,
13    /// `true` when the entire file was consumed or schema was read from
14    /// metadata; `false` when inference stopped at the sample-size cap and
15    /// the schema may not have fully stabilized.
16    pub schema_stable: bool,
17}
18
19/// A single column's name and inferred data type.
20#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
21pub struct ColumnSchema {
22    pub name: String,
23    pub data_type: DataType,
24}
25
26/// Result of a quick row count operation.
27#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
28pub struct RowCountEstimate {
29    /// The estimated or exact row count.
30    pub count: u64,
31    /// Whether the count is exact or an estimate.
32    pub exact: bool,
33    /// How the count was obtained.
34    pub method: CountMethod,
35    /// Time taken in milliseconds.
36    pub count_time_ms: u128,
37}
38
39/// Method used to obtain the row count.
40#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
41pub enum CountMethod {
42    /// Read from Parquet file footer metadata (exact, zero row reading).
43    ParquetMetadata,
44    /// Full scan of the file (exact).
45    FullScan,
46    /// Sample-based estimation (approximate).
47    Sampling,
48    /// Full scan of a streaming source (no file metadata available).
49    StreamFullScan,
50}