tabiew 0.14.0

A lightweight TUI application to view and query tabular data files, such as CSV, TSV, and Parquet.
use std::io::Read;

use polars::{
    frame::DataFrame,
    prelude::{AnyValue, Column},
};
use scraper::{ElementRef, Html, Selector};

use crate::{
    AppResult,
    args::Args,
    io::reader::{DataFrameReader, NamedFrames, ReaderSource},
    misc::stdin::stdin,
};

/// Reader that turns the `<table>` elements of an HTML document into data frames.
///
/// The type carries no state; all work happens in its [`DataFrameReader`] implementation.
#[derive(Debug, Default)]
pub struct HtmlToDataFrame;

impl HtmlToDataFrame {
    pub fn from_args(_args: &Args) -> Self {
        Self
    }
}

impl DataFrameReader for HtmlToDataFrame {
    /// Reads an HTML document and returns one named frame per `<table>` element.
    ///
    /// Frames are named `html_table_{n}`, where `n` is the 1-based position of the table among
    /// the elements matched by the `table` selector.
    ///
    /// The input is read as raw bytes. Valid UTF-8 is used as-is; otherwise invalid sequences
    /// are replaced with U+FFFD so that pages saved in a legacy encoding can still be opened
    /// instead of aborting the whole read.
    ///
    /// # Errors
    ///
    /// Returns an error if the file or standard input cannot be read, or if any table cannot be
    /// converted into a data frame.
    fn read_to_data_frames(&self, input: ReaderSource) -> AppResult<NamedFrames> {
        let bytes = match &input {
            ReaderSource::File(path) => std::fs::read(path)?,
            ReaderSource::Stdin => {
                let mut buf = Vec::new();
                stdin().read_to_end(&mut buf)?;
                buf
            }
        };

        // Take ownership of the buffer without copying when it is already valid UTF-8; only
        // fall back to the (allocating) lossy conversion when it is not.
        let contents = String::from_utf8(bytes)
            .unwrap_or_else(|err| String::from_utf8_lossy(err.as_bytes()).into_owned());

        let document = Html::parse_document(&contents);
        let table_selector = Selector::parse("table").expect("`table` is a valid CSS selector");

        document
            .select(&table_selector)
            .enumerate()
            .map(|(idx, table)| {
                table_to_data_frame(table).map(|df| (format!("html_table_{}", idx + 1), df))
            })
            .collect::<AppResult<Vec<_>>>()
            .map(Vec::into_boxed_slice)
    }
}

/// Converts one `<table>` element into a [`DataFrame`] of string (or null) values.
///
/// Column names come from the `<th>` cells inside the table's own `<thead>`. Blank names and
/// columns that have no header cell are named `column_{n}` (1-based position), and repeated
/// names receive a numeric suffix (`name_2`, `name_3`, ...) so that frame construction does not
/// fail on duplicate column names. When the table has no `<thead>` header cells, every row is
/// treated as data.
///
/// Only rows and cells that belong directly to `table` are used; the contents of nested tables
/// are not mixed into the outer frame. The frame is as wide as the header or the widest data
/// row, whichever is larger, and shorter rows are padded with nulls.
///
/// # Errors
///
/// Returns an error if polars rejects the assembled columns.
fn table_to_data_frame(table: ElementRef) -> AppResult<DataFrame> {
    let thead_th_selector =
        Selector::parse("thead th").expect("`thead th` is a valid CSS selector");
    let tr_selector = Selector::parse("tr").expect("`tr` is a valid CSS selector");
    let cell_selector = Selector::parse("th, td").expect("`th, td` is a valid CSS selector");

    // Selectors match against the whole ancestor chain and every descendant, so each match is
    // additionally checked to belong to this table rather than to a nested or enclosing one.
    let header_cells: Vec<String> = table
        .select(&thead_th_selector)
        .filter(|th| nearest_ancestor_is(*th, "table", table) && in_own_thead(*th))
        .map(cell_text)
        .collect();

    let has_thead_headers = !header_cells.is_empty();

    let data_rows: Vec<Vec<String>> = table
        .select(&tr_selector)
        .filter(|tr| {
            nearest_ancestor_is(*tr, "table", table)
                // Header rows are only skipped when they actually supplied the column names.
                && !(has_thead_headers && in_own_thead(*tr))
        })
        .map(|tr| {
            tr.select(&cell_selector)
                // Skip cells of tables nested inside one of this row's cells.
                .filter(|cell| nearest_ancestor_is(*cell, "tr", tr))
                .map(cell_text)
                .collect()
        })
        .collect();

    // Never drop data: widen the frame when a row has more cells than the header.
    let widest_row = data_rows.iter().map(Vec::len).max().unwrap_or(0);
    let column_count = header_cells.len().max(widest_row);
    let headers = unique_column_names(header_cells, column_count);

    let row_count = data_rows.len();
    let mut columns: Vec<Vec<AnyValue<'static>>> = (0..column_count)
        .map(|_| Vec::with_capacity(row_count))
        .collect();
    for row in data_rows {
        let mut row_iter = row.into_iter();
        for col in columns.iter_mut() {
            col.push(match row_iter.next() {
                Some(value) => AnyValue::StringOwned(value.into()),
                // Pad short rows so every column ends up with the same height.
                None => AnyValue::Null,
            });
        }
    }

    Ok(DataFrame::new_infer_height(
        headers
            .into_iter()
            .zip(columns)
            .map(|(name, values)| Column::new(name.into(), values))
            .collect(),
    )?)
}

/// Returns `true` when the closest ancestor element of `elem` named `tag` is `expected`.
fn nearest_ancestor_is(elem: ElementRef<'_>, tag: &str, expected: ElementRef<'_>) -> bool {
    elem.ancestors()
        .filter_map(ElementRef::wrap)
        .find(|ancestor| ancestor.value().name() == tag)
        .is_some_and(|ancestor| ancestor.id() == expected.id())
}

/// Returns `true` when `elem` sits inside a `<thead>` of its closest enclosing `<table>`.
fn in_own_thead(elem: ElementRef<'_>) -> bool {
    elem.ancestors()
        .filter_map(ElementRef::wrap)
        // Stop at the owning table so a `<thead>` of an outer table is not taken into account.
        .take_while(|ancestor| ancestor.value().name() != "table")
        .any(|ancestor| ancestor.value().name() == "thead")
}

/// Produces exactly `column_count` distinct, non-empty column names from the raw header texts.
fn unique_column_names(raw: Vec<String>, column_count: usize) -> Vec<String> {
    let mut seen = std::collections::HashSet::with_capacity(column_count);
    let mut raw = raw.into_iter();

    (1..=column_count)
        .map(|position| {
            let base = match raw.next() {
                Some(name) if !name.is_empty() => name,
                // Missing or blank header: fall back to the positional name.
                _ => format!("column_{position}"),
            };

            let mut candidate = base.clone();
            let mut suffix = 2usize;
            while !seen.insert(candidate.clone()) {
                candidate = format!("{base}_{suffix}");
                suffix += 1;
            }
            candidate
        })
        .collect()
}

/// Returns the concatenated text of `elem` and its descendants, with leading and trailing
/// whitespace removed.
fn cell_text(elem: ElementRef) -> String {
    let raw: String = elem.text().collect();
    raw.trim().to_owned()
}