excel2df 0.1.5

A library for converting Excel files to Polars DataFrames. It supports multi-threading to improve performance.
Documentation
use std::sync::{atomic::{AtomicUsize, Ordering}, Arc};

use polars::prelude::{AnyValue, TimeUnit};
use quick_xml::{events::{attributes::{Attribute, Attributes}, Event}, name::QName};
use rayon::prelude::*;
use crate::{cellvalue::{Cell,CellFormat}, wb::DateSystem};

/// Largest column number accepted by `col2num`; 16384 is the number of column `XFD`.
const XL_MAX_COL: u32 = 16384;
/// Smallest column number accepted by `col2num`; column `A` maps to 1.
const XL_MIN_COL: u32 = 1;



/// Looks up an attribute by key and returns its value as an owned `String`.
///
/// Attributes are examined in document order and the first one whose key
/// equals `which` wins. `None` is returned when no attribute has that key.
///
/// # Panics
///
/// Panics if an attribute examined before a match fails to parse, or if the
/// matched value is not valid UTF-8.
#[inline]
pub fn get_attribute(attrs: Attributes, which: &[u8]) -> Option<String> {
    let wanted = QName(which);
    attrs
        .map(|candidate| candidate.unwrap())
        .find(|candidate| candidate.key == wanted)
        .map(|found| attr_value(&found))
}

/// Copies the raw bytes of an attribute value into an owned `String`.
///
/// The bytes are taken as-is; no XML unescaping is applied here.
///
/// # Panics
///
/// Panics if the value is not valid UTF-8.
#[inline]
pub fn attr_value(a: &Attribute) -> String {
    let owned_bytes = Vec::from(&a.value[..]);
    String::from_utf8(owned_bytes).unwrap()
}

/// Extracts the bottom-right corner of a range reference such as `"A1:C10"`.
///
/// The text after the first `:` is split into column letters and a row
/// number. The result is `(row, col)`, with the column converted by
/// [`col2num`], so `"A1:C10"` yields `(10, 3)`. A single `$` in front of the
/// letters and/or the digits (`"A1:$C$10"`) is accepted.
///
/// `(0, 0)` is returned when the input has no `:`. The same fallback is used
/// when the part after the `:` is not a valid cell reference (missing row,
/// missing or out-of-range column, trailing garbage), so malformed input no
/// longer panics.
pub fn used_area(used_area_range: &str) -> (u32, u32) {
    // `split_once` works on byte offsets, so non-ASCII text before the colon
    // cannot produce an invalid slice boundary.
    let bottom_right = match used_area_range.split_once(':') {
        Some((_, cell)) => cell,
        None => return (0, 0),
    };

    let cell = bottom_right.strip_prefix('$').unwrap_or(bottom_right);
    let letters_len = cell
        .find(|c: char| !c.is_ascii_alphabetic())
        .unwrap_or(cell.len());
    let (letters, rest) = cell.split_at(letters_len);
    let digits = rest.strip_prefix('$').unwrap_or(rest);

    match (col2num(letters), digits.parse::<u32>()) {
        (Some(col), Ok(row)) => (row, col),
        _ => (0, 0),
    }
}

/// Converts column letters (`"A"`, `"AA"`, `"XFD"`, case-insensitive) into a
/// 1-based column number.
///
/// Returns `None` when `letter` is empty, contains anything other than ASCII
/// letters, or denotes a column outside `XL_MIN_COL..=XL_MAX_COL`.
#[inline]
pub fn col2num(letter: &str) -> Option<u32> {
    let mut num: u32 = 0;
    for c in letter.chars() {
        // Only ASCII letters are valid; Unicode case mapping would otherwise
        // turn characters such as 'ß' into "SS" and accept them.
        if !c.is_ascii_alphabetic() {
            return None;
        }
        num = num * 26 + (c.to_ascii_uppercase() as u32 - 'A' as u32) + 1;
        // Bail out as soon as the limit is passed so that arbitrarily long
        // input can never overflow the accumulator.
        if num > XL_MAX_COL {
            return None;
        }
    }
    if !(XL_MIN_COL..=XL_MAX_COL).contains(&num) {
        return None;
    }
    Some(num)
}

/// Parses a cell reference such as `"B12"` into `(row, col)`.
///
/// ASCII letters (case-insensitive) are accumulated as a base-26 column
/// number where `A` is 1, and ASCII digits as a decimal row number; the two
/// kinds of character are accumulated independently of their order. An empty
/// reference yields `Ok((0, 0))`.
///
/// # Errors
///
/// Returns `Err` when the column or row does not fit in `usize`, or when the
/// reference contains a character that is neither an ASCII letter nor an
/// ASCII digit (for example `$`).
pub fn reference2pos(reference: &str) -> Result<(usize, usize), String> {
    let mut col: usize = 0;
    let mut row: usize = 0;
    for c in reference.chars() {
        if c.is_ascii_alphabetic() {
            let col_value = c.to_ascii_uppercase() as usize - 'A' as usize + 1;
            col = col
                .checked_mul(26)
                .and_then(|x| x.checked_add(col_value))
                .ok_or_else(|| "Column index overflow".to_string())?;
        } else if let Some(digit) = c.to_digit(10) {
            row = row
                .checked_mul(10)
                .and_then(|x| x.checked_add(digit as usize))
                .ok_or_else(|| "Row index overflow".to_string())?;
        } else {
            // Report the problem through the Result instead of panicking.
            return Err(format!(
                "Invalid character {:?} in cell reference {:?}",
                c, reference
            ));
        }
    }
    Ok((row, col))
}

/// The closing tag of a row in a sheet XML file.
/// It is used to find where a row ends in the XML data.
pub const ROW_END: &str = "</row>";

/// Splits raw XML bytes into text chunks that each end with a terminator.
///
/// Every chunk returned ends right after an occurrence of `chunk_end` (for
/// example `"</row>"`), so no element closed by that terminator is cut in
/// half. The chunker itself walks the data sequentially; the chunks are
/// independent strings that callers can then process in parallel.
///
/// NOTE(review): the cursor is atomic, but a chunk is produced with a separate
/// load and store, so two threads draining the same chunker at once could
/// receive overlapping chunks.
pub struct XmlChunker<'a> {
    /// The complete XML document being split.
    data: &'a [u8],
    /// Byte offset of the first byte that has not been handed out yet.
    cursor: AtomicUsize,
    /// Target size of a chunk in bytes (always at least 1).
    chunk_size: usize,
    /// Terminator that every chunk must end with.
    chunk_end: &'a str,
}

impl<'a> XmlChunker<'a> {
    /// Creates a chunker that aims for roughly `chunks` pieces of `data`.
    ///
    /// The target chunk size is `data.len() / chunks`. A `chunks` value of 0
    /// is treated as 1, and the target size is never smaller than one byte.
    /// Because chunks are trimmed to terminator boundaries, more than
    /// `chunks` pieces may be produced.
    pub fn new(data: &'a [u8], chunks: usize, chunk_end: &'a str) -> Self {
        Self {
            data,
            cursor: AtomicUsize::new(0),
            chunk_size: (data.len() / chunks.max(1)).max(1),
            chunk_end,
        }
    }

    /// Offset just past the last occurrence of `needle` inside `haystack`.
    fn last_match_end(haystack: &[u8], needle: &[u8]) -> Option<usize> {
        haystack
            .windows(needle.len())
            .rposition(|candidate| candidate == needle)
            .map(|at| at + needle.len())
    }

    /// Offset just past the first occurrence of `needle` inside `haystack`.
    fn first_match_end(haystack: &[u8], needle: &[u8]) -> Option<usize> {
        haystack
            .windows(needle.len())
            .position(|candidate| candidate == needle)
            .map(|at| at + needle.len())
    }

    /// Returns the next chunk, or `None` once no further terminator exists.
    fn next_chunk(&self) -> Option<String> {
        let start = self.cursor.load(Ordering::Acquire);
        if start >= self.data.len() {
            return None;
        }

        let remaining = &self.data[start..];
        let window = &remaining[..self.chunk_size.min(remaining.len())];
        let terminator = self.chunk_end.as_bytes();

        let cut = if terminator.is_empty() {
            // Without a terminator there is no boundary to respect, so the
            // data is split into fixed-size windows.
            window.len()
        } else {
            // Prefer the last terminator that ends inside the window, including
            // one that ends exactly at the window's end. If the window holds
            // none (an element longer than the target size), extend the chunk
            // to the next terminator rather than dropping the rest of the data.
            Self::last_match_end(window, terminator)
                .or_else(|| Self::first_match_end(remaining, terminator))?
        };

        self.cursor.store(start + cut, Ordering::Release);
        // Invalid UTF-8 is replaced with U+FFFD rather than emptying the chunk.
        Some(String::from_utf8_lossy(&remaining[..cut]).into_owned())
    }

    /// Collects all chunks that have not been handed out yet.
    ///
    /// Bytes after the final terminator (for example closing tags of the
    /// enclosing elements) are not part of any chunk. Calling this again
    /// after the data has been consumed returns an empty vector.
    pub fn chunks(&self) -> Vec<String> {
        let mut chunks = Vec::new();
        while let Some(chunk) = self.next_chunk() {
            chunks.push(chunk);
        }
        chunks
    }
}

pub fn get_cell_data_from_chunk<'a>(
    chunk: &str,
    share_strings:Arc<Vec<String>>,
    styles: Arc<Vec<CellFormat>>,
    date_system: Arc<DateSystem>
) -> Option<Vec<Cell<'a>>> {
    let mut cells: Vec<Cell<'a>> = Vec::with_capacity(2000);
    let mut reader = quick_xml::Reader::from_str(chunk);
    let mut buf = Vec::with_capacity(1024);
    let mut in_value = false;
    let mut is_string = false;
    let mut is_inlinstring = false;
    let mut each_col_count =0;

    let mut cell_row = 0;
    let mut cell_col = 0;
    let mut cell_format = CellFormat::Number;

    let mut cell_value = AnyValue::Null;

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) if e.name().as_ref() == b"c" => {
                e.attributes().for_each(|a| {
                    let a = a.unwrap();
                    match a.key.as_ref() {
                        b"r" => {
                            let reference = attr_value(&a);
                            if let Ok((row, col)) = reference2pos(reference.as_ref()) {
                                cell_row = row;
                                cell_col = col;
                            }
                        }
                        b"t" => {
                            let t = attr_value(&a);
                            if t == "s" || t == "str" || t == "inlineStr" {
                                is_string = true;
                            }
                            if t =="inlineStr"{
                                is_inlinstring = true;
                            }
                        }
                        b"s" => {
                            let s = attr_value(&a);
                            cell_format = *styles.get(s.parse::<usize>().unwrap()).unwrap();
                        }
                        _ => {}
                    }
                });
            }
            Ok(Event::End(ref e)) if e.name().as_ref() == b"c" => {
                // cells.push(Cell{value:cell_value.to_owned(),pos:(cell_row,cell_col)});
            }

            Ok(Event::End(ref e)) if e.name().as_ref() == b"row" => {
            }

            Ok(Event::Start(ref e)) if e.name().as_ref() == b"v"||e.name().as_ref() == b"t" => {
                in_value = true;
            }
            Ok(Event::Text(ref e)) if in_value => {
                in_value = false;
                let raw_value = &e.unescape().unwrap()[..];
                if is_string {
                    if is_inlinstring{
                        cell_value = AnyValue::StringOwned(raw_value.to_string().into());
                        cells.push(Cell{value:cell_value,pos:(cell_row,cell_col)});
                    }else{
                        let s = share_strings.get(raw_value.parse::<usize>().unwrap())
                        .map(|x| x.clone())
                        .unwrap_or_default();
                        cell_value = AnyValue::StringOwned(s.into());
                        cells.push(Cell{value:cell_value,pos:(cell_row,cell_col)});
                    }

                    is_string = false;
                    is_inlinstring = false;
                } else {
                    let num = raw_value.parse::<f64>().unwrap();
                    match cell_format {
                        CellFormat::Number => {
                            cell_value = AnyValue::Float64(num);
                            cells.push(Cell{value:cell_value,pos:(cell_row,cell_col)});
                        }
                        CellFormat::DateTime => {
                            let gap_days = match *date_system {
                                DateSystem::V1900 => 25569,
                                DateSystem::V1904 => 24109,
                            };
                            cell_value = AnyValue::Date(num as i32 - gap_days);
                            cells.push(Cell{value:cell_value,pos:(cell_row,cell_col)});
                        }
                        CellFormat::TimeDelta => {
                            let gap_days = match *date_system {
                                DateSystem::V1900 => 25569,
                                DateSystem::V1904 => 24109,
                            };

                            let milliseconds = ((num - gap_days as f64) * 86400000.0) as i64;
                            cell_value = AnyValue::Datetime(milliseconds, TimeUnit::Milliseconds, None);
                            cells.push(Cell{value:cell_value,pos:(cell_row,cell_col)});
                        }
                    }// match

                    cell_format = CellFormat::Number;
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
            _ => (),
        }
        buf.clear();
    }

    Some(cells)
}