tabiew 0.13.1 - Docs.rs

use std::{
    collections::HashSet,
    fs::read_to_string,
    io::{Cursor, Read},
    iter::once,
};

use fwf_rs::Reader;
use itertools::Itertools;
use polars::{frame::DataFrame, prelude::Column};

use crate::{
    AppResult,
    args::Args,
    io::reader::ReaderSource,
    misc::{iter_ext::ZipItersExt, stdin::stdin, table_name_generator::TableNameGeneratorExt},
};

use super::{DataFrameReader, NamedFrames};

#[derive(Debug)]
pub struct FwfToDataFrame {
    widths: Vec<usize>,
    has_header: bool,
    separator_length: usize,
    flexible_width: bool,
}

impl FwfToDataFrame {
    pub fn from_args(args: &Args) -> Self {
        Self {
            widths: parse_width(&args.widths).unwrap_or_default(),
            has_header: !args.no_header,
            separator_length: args.separator_length,
            flexible_width: !args.no_flexible_width,
        }
    }

    pub fn with_widths(mut self, widths: Vec<usize>) -> Self {
        self.widths = widths;
        self
    }

    pub fn with_has_header(mut self, has_header: bool) -> Self {
        self.has_header = has_header;
        self
    }

    pub fn with_separator_length(mut self, separator_length: usize) -> Self {
        self.separator_length = separator_length;
        self
    }

    pub fn with_flexible_width(mut self, flexible_width: bool) -> Self {
        self.flexible_width = flexible_width;
        self
    }
}

impl Default for FwfToDataFrame {
    fn default() -> Self {
        Self {
            widths: Vec::default(),
            has_header: true,
            separator_length: 0,
            flexible_width: true,
        }
    }
}

impl DataFrameReader for FwfToDataFrame {
    fn read_to_data_frames(&self, input: ReaderSource) -> AppResult<NamedFrames> {
        let file_content = match &input {
            ReaderSource::File(path) => read_to_string(path)?,
            ReaderSource::Stdin => {
                let mut buf = String::new();
                stdin().read_to_string(&mut buf)?;
                buf
            }
        };

        let widths = if self.widths.is_empty() {
            let common_space_indices = file_content
                .lines()
                .map(|line| line.trim())
                .filter(|line| !line.is_empty())
                .map(|line| {
                    let length = line.chars().count();
                    let spaces = line
                        .chars()
                        .enumerate()
                        .filter_map(|(i, c)| c.is_whitespace().then_some(i))
                        .collect::<HashSet<usize>>();
                    (length, spaces)
                })
                .reduce(|(la, sa), (lb, sb)| (la.max(lb), sa.intersection(&sb).copied().collect()))
                .map(|(len, idx_set)| idx_set.into_iter().chain(once(len)).sorted().collect_vec())
                .unwrap_or_default();
            infer_widths(common_space_indices)
        } else {
            self.widths.clone()
        };
        let reader = Reader::new(
            Cursor::new(file_content),
            widths.clone(),
            self.separator_length,
            self.flexible_width,
            self.has_header,
        )?;
        let header = reader
            .header()
            .map(|rec| {
                rec.iter().fold(Vec::new(), |mut vec, slice| {
                    if let Some(name) = slice.snake_case_names().find(|name| !vec.contains(name)) {
                        vec.push(name);
                    } else {
                        vec.push(format!("column_{}", vec.len() + 1));
                    }
                    vec
                })
            })
            .unwrap_or_else(|| {
                (0..widths.len())
                    .map(|idx| format!("column_{}", idx + 1))
                    .collect_vec()
            });

        let columns = reader
            .records()
            .filter_map(Result::ok)
            .map(|record| {
                record
                    .iter()
                    .map(str::trim)
                    .map(ToOwned::to_owned)
                    .collect_vec()
                    .into_iter()
            })
            .zip_iters()
            .collect_vec();

        let df = DataFrame::new_infer_height(
            header
                .into_iter()
                .zip(columns)
                .map(|(name, vals)| Column::new(name.into(), vals))
                .collect(),
        )?;

        Ok([(input.table_name(), df)].into())
    }
}

fn parse_width(widths: impl AsRef<str>) -> AppResult<Vec<usize>> {
    Ok(widths
        .as_ref()
        .split(',')
        .map(|w| w.parse::<usize>())
        .collect::<Result<Vec<_>, _>>()?)
}

fn infer_widths(space_indices: Vec<usize>) -> Vec<usize> {
    let mut indices = Vec::default();
    let mut start = 0;
    for (i, idx) in space_indices.iter().enumerate() {
        if let Some(nidx) = space_indices.get(i + 1) {
            if nidx - idx > 1 {
                indices.push(idx - start);
                start = idx + 1
            }
        } else {
            indices.push(idx - start);
        }
    }
    indices
}