#![doc(html_root_url = "https://docs.rs/tabkit")]
#![cfg_attr(docsrs, feature(doc_cfg))]
use std::path::Path;
mod error;
pub use error::{Error, Result};
#[cfg(feature = "calamine")]
mod calamine;
#[cfg(feature = "calamine")]
pub use crate::calamine::CalamineReader;
#[cfg(feature = "csv")]
mod csv;
#[cfg(feature = "csv")]
pub use crate::csv::CsvReader;
#[cfg(feature = "parquet")]
mod parquet;
#[cfg(feature = "parquet")]
pub use crate::parquet::ParquetReader;
/// Result of reading a tabular file: column descriptors plus a sample of rows.
#[derive(Debug, Clone, Default)]
#[non_exhaustive]
pub struct Table {
    /// Descriptors for the table's columns.
    pub columns: Vec<Column>,
    /// Rows sampled from the source.
    pub sample_rows: Vec<Row>,
    /// Row count supplied by whoever built the table, if any;
    /// [`Table::new`] leaves it as `None`.
    pub row_count: Option<u64>,
    /// Free-form string key/value pairs; empty after [`Table::new`].
    pub metadata: std::collections::HashMap<String, String>,
}

impl Table {
    /// Builds a table from `columns` and `sample_rows`.
    ///
    /// `row_count` starts as `None` and `metadata` starts empty.
    #[must_use]
    pub fn new(columns: Vec<Column>, sample_rows: Vec<Row>) -> Self {
        // Everything not passed in takes its `Default` value
        // (`None` / empty map).
        Self {
            columns,
            sample_rows,
            ..Self::default()
        }
    }
}
/// Description of a single column: its name, logical type and nullability.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct Column {
    /// Column name.
    pub name: String,
    /// Logical type of the column's values.
    pub data_type: DataType,
    /// Whether the column may contain null values.
    pub nullable: bool,
}

impl Column {
    /// Creates a column descriptor, converting `name` into an owned `String`.
    #[must_use]
    pub fn new(name: impl Into<String>, data_type: DataType, nullable: bool) -> Self {
        let name = name.into();
        Self {
            name,
            data_type,
            nullable,
        }
    }
}
/// Logical type of a column or of a single [`Value`].
///
/// Derives `Hash` in addition to `Eq` so the type can be used as a key in
/// `HashMap`/`HashSet` (e.g. to count columns per type).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
#[non_exhaustive]
pub enum DataType {
    /// Boolean values.
    Bool,
    /// Integer values.
    Integer,
    /// Floating-point values.
    Float,
    /// Calendar dates.
    Date,
    /// Date-and-time values.
    DateTime,
    /// Text values.
    Text,
    /// No type is known; this is the `Default` variant.
    #[default]
    Unknown,
}
/// One row of cells, stored as a plain `Vec` of [`Value`]s.
// NOTE(review): presumably cells are ordered to line up with `Table::columns` — confirm against the readers.
pub type Row = Vec<Value>;
/// A single cell value.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum Value {
    /// Missing / empty cell.
    Null,
    /// Boolean cell.
    Bool(bool),
    /// Integer cell.
    Integer(i64),
    /// Floating-point cell.
    Float(f64),
    /// Date cell, carried as a string.
    Date(String),
    /// Date-and-time cell, carried as a string.
    DateTime(String),
    /// Text cell.
    Text(String),
}

impl Value {
    /// Returns the [`DataType`] matching this value's variant, or `None`
    /// for [`Value::Null`].
    #[must_use]
    pub fn data_type(&self) -> Option<DataType> {
        let kind = match self {
            // A null carries no type information at all.
            Self::Null => return None,
            Self::Bool(_) => DataType::Bool,
            Self::Integer(_) => DataType::Integer,
            Self::Float(_) => DataType::Float,
            Self::Date(_) => DataType::Date,
            Self::DateTime(_) => DataType::DateTime,
            Self::Text(_) => DataType::Text,
        };
        Some(kind)
    }
}
/// Options passed to a reader when loading a file.
///
/// Start from [`ReadOptions::default`] and adjust with the chainable
/// builder methods.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct ReadOptions {
    /// Upper bound on sampled rows; defaults to `100`.
    pub max_sample_rows: usize,
    /// Sheet to read, if one was requested; defaults to `None`.
    pub sheet_name: Option<String>,
    /// Whether the source has a header row; defaults to `true`.
    pub has_header: bool,
}

impl Default for ReadOptions {
    /// 100 sample rows, no explicit sheet, header row expected.
    fn default() -> Self {
        Self {
            has_header: true,
            sheet_name: None,
            max_sample_rows: 100,
        }
    }
}

impl ReadOptions {
    /// Returns the options with `max_sample_rows` set to `n`.
    #[must_use]
    pub fn max_sample_rows(self, n: usize) -> Self {
        Self {
            max_sample_rows: n,
            ..self
        }
    }

    /// Returns the options with `sheet_name` set to `Some(name)`.
    #[must_use]
    pub fn sheet_name(self, name: impl Into<String>) -> Self {
        Self {
            sheet_name: Some(name.into()),
            ..self
        }
    }

    /// Returns the options with `has_header` set to the given flag.
    #[must_use]
    pub fn has_header(self, has_header: bool) -> Self {
        Self { has_header, ..self }
    }
}
/// A format-specific reader that loads a file into a [`Table`].
///
/// Implementations must be `Send + Sync`; [`Engine`] stores them as boxed
/// trait objects and picks one by file extension.
pub trait Reader: Send + Sync {
/// File extensions this reader handles.
///
/// [`Engine`] lowercases the path's extension before looking it up, and
/// `Path::extension` never includes the leading dot, so list entries in
/// lowercase without a dot (e.g. `"csv"`).
fn extensions(&self) -> &[&'static str];
/// Reads the file at `path` according to `options`, returning the loaded
/// [`Table`] or an error.
fn read(&self, path: &Path, options: &ReadOptions) -> Result<Table>;
/// Name identifying this reader.
///
/// Defaults to the implementing type's name as reported by
/// [`std::any::type_name`]; override it for a shorter label.
fn name(&self) -> &'static str {
std::any::type_name::<Self>()
}
}
/// Registry of [`Reader`]s that dispatches a file to the first registered
/// reader claiming its extension.
pub struct Engine {
    /// Registered readers in registration order; lookups scan front to back.
    readers: Vec<Box<dyn Reader>>,
}

impl std::fmt::Debug for Engine {
    /// Formats the engine as the list of registered reader names
    /// (`Reader::name`), since `dyn Reader` has no `Debug` bound of its own.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let names: Vec<&'static str> = self.readers.iter().map(|reader| reader.name()).collect();
        f.debug_struct("Engine").field("readers", &names).finish()
    }
}

impl Engine {
    /// Creates an engine with no readers registered.
    #[must_use]
    pub fn new() -> Self {
        Self {
            readers: Vec::new(),
        }
    }

    /// Creates an engine pre-loaded with one reader per enabled Cargo
    /// feature, registered in the order `calamine`, `csv`, `parquet`.
    ///
    /// With none of those features enabled the engine is empty.
    #[must_use]
    pub fn with_defaults() -> Self {
        // `mut` is only used when at least one reader feature is enabled;
        // without any, every `register` call below is compiled out.
        #[allow(unused_mut)]
        let mut engine = Self::new();
        #[cfg(feature = "calamine")]
        {
            engine.register(Box::new(CalamineReader::new()));
        }
        #[cfg(feature = "csv")]
        {
            engine.register(Box::new(CsvReader::new()));
        }
        #[cfg(feature = "parquet")]
        {
            engine.register(Box::new(ParquetReader::new()));
        }
        engine
    }

    /// Appends `reader` to the registry and returns `self` for chaining.
    ///
    /// Earlier registrations take precedence when several readers claim the
    /// same extension.
    pub fn register(&mut self, reader: Box<dyn Reader>) -> &mut Self {
        self.readers.push(reader);
        self
    }

    /// Number of registered readers.
    #[must_use]
    pub fn len(&self) -> usize {
        self.readers.len()
    }

    /// Returns `true` when no reader is registered.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.readers.is_empty()
    }

    /// Reads `path` with the first registered reader that claims its
    /// extension.
    ///
    /// # Errors
    ///
    /// Returns [`Error::UnsupportedFormat`] when `extension_of` yields no
    /// extension for `path` or when no registered reader claims it;
    /// otherwise returns whatever the selected reader returns.
    pub fn read(&self, path: &Path, options: &ReadOptions) -> Result<Table> {
        let ext = extension_of(path).ok_or_else(|| {
            Error::UnsupportedFormat(format!("no file extension on {}", path.display()))
        })?;
        let reader = self
            .find(&ext)
            .ok_or_else(|| Error::UnsupportedFormat(format!("no reader registered for .{ext}")))?;
        reader.read(path, options)
    }

    /// Returns the first registered reader whose extension list contains
    /// `ext`, compared ASCII case-insensitively.
    ///
    /// The path side is already lowercased by the caller; ignoring case here
    /// keeps a reader that declares e.g. `"CSV"` from being silently
    /// unreachable.
    fn find(&self, ext: &str) -> Option<&dyn Reader> {
        self.readers
            .iter()
            .find(|reader| {
                reader
                    .extensions()
                    .iter()
                    .any(|known| known.eq_ignore_ascii_case(ext))
            })
            .map(std::convert::AsRef::as_ref)
    }
}

impl Default for Engine {
    /// Equivalent to [`Engine::with_defaults`].
    fn default() -> Self {
        Self::with_defaults()
    }
}
/// Returns the lowercased (ASCII) extension of `path`, without the dot.
///
/// Yields `None` when the path has no extension, when the extension is not
/// valid UTF-8, or when it is empty.
fn extension_of(path: &Path) -> Option<String> {
    path.extension()
        .and_then(|os| os.to_str())
        // `Path::extension` reports `Some("")` for names ending in a dot
        // (e.g. `report.`); treat that as "no extension" rather than looking
        // up a reader for the empty string.
        .filter(|ext| !ext.is_empty())
        .map(str::to_ascii_lowercase)
}
#[cfg_attr(
not(any(feature = "calamine", feature = "csv", feature = "parquet")),
allow(dead_code)
)]
pub(crate) fn infer_column_type(samples: &[Value]) -> (DataType, bool) {
let mut current: Option<DataType> = None;
let mut nullable = false;
for v in samples {
match v.data_type() {
None => nullable = true,
Some(t) => {
current = Some(promote(current, t));
}
}
}
(current.unwrap_or(DataType::Unknown), nullable)
}
/// Merges the type seen so far (`current`) with a newly observed type.
///
/// * nothing seen yet: the new type;
/// * identical types: unchanged;
/// * `Integer` with `Float`, in either order: `Float`;
/// * `Date` with `DateTime`, in either order: `DateTime`;
/// * any other combination: `Text`.
fn promote(current: Option<DataType>, new: DataType) -> DataType {
    let seen = match current {
        Some(seen) => seen,
        None => return new,
    };
    if seen == new {
        return seen;
    }
    // Order-insensitive check for a specific pair of distinct types.
    let mixes = |a: DataType, b: DataType| (seen == a && new == b) || (seen == b && new == a);
    if mixes(DataType::Integer, DataType::Float) {
        DataType::Float
    } else if mixes(DataType::Date, DataType::DateTime) {
        DataType::DateTime
    } else {
        DataType::Text
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Reads `file` through `engine` with default options.
    fn read_with_defaults(engine: &Engine, file: &str) -> Result<Table> {
        engine.read(Path::new(file), &ReadOptions::default())
    }

    /// An engine without readers must refuse every file.
    #[test]
    fn empty_engine_rejects_all_files() {
        let outcome = read_with_defaults(&Engine::new(), "anything.xlsx");
        assert!(matches!(outcome, Err(Error::UnsupportedFormat(_))));
    }

    /// A path with no extension is reported as unsupported, not a panic.
    #[test]
    fn missing_extension_is_a_clean_error() {
        let outcome = read_with_defaults(&Engine::with_defaults(), "/no-extension");
        assert!(matches!(outcome, Err(Error::UnsupportedFormat(_))));
    }

    /// Builder methods chain and each one sets its own field.
    #[test]
    fn read_options_builders_chain() {
        let configured = ReadOptions::default()
            .max_sample_rows(50)
            .sheet_name("Q1")
            .has_header(false);
        assert_eq!(configured.max_sample_rows, 50);
        assert_eq!(configured.sheet_name.as_deref(), Some("Q1"));
        assert!(!configured.has_header);
    }

    #[test]
    fn infer_all_integers_yields_integer_not_nullable() {
        let inferred = infer_column_type(&[Value::Integer(1), Value::Integer(2), Value::Integer(3)]);
        assert_eq!(inferred, (DataType::Integer, false));
    }

    #[test]
    fn infer_int_plus_float_promotes_to_float() {
        let inferred = infer_column_type(&[Value::Integer(1), Value::Float(2.5)]);
        assert_eq!(inferred, (DataType::Float, false));
    }

    #[test]
    fn infer_int_plus_text_falls_back_to_text() {
        let inferred = infer_column_type(&[Value::Integer(1), Value::Text("two".into())]);
        assert_eq!(inferred, (DataType::Text, false));
    }

    #[test]
    fn infer_with_null_marks_nullable() {
        let inferred = infer_column_type(&[Value::Integer(1), Value::Null, Value::Integer(2)]);
        assert_eq!(inferred, (DataType::Integer, true));
    }

    #[test]
    fn infer_all_null_is_unknown() {
        let inferred = infer_column_type(&[Value::Null, Value::Null]);
        assert_eq!(inferred, (DataType::Unknown, true));
    }

    #[test]
    fn empty_samples_default_to_unknown() {
        let inferred = infer_column_type(&[]);
        assert_eq!(inferred, (DataType::Unknown, false));
    }
}