Function polars_io::csv::utils::infer_file_schema

pub fn infer_file_schema(
    reader_bytes: &ReaderBytes<'_>,
    delimiter: u8,
    max_read_lines: Option<usize>,
    has_header: bool,
    schema_overwrite: Option<&Schema>,
    skip_rows: &mut usize,
    skip_rows_after_header: usize,
    comment_char: Option<u8>,
    quote_char: Option<u8>,
    eol_char: u8,
    null_values: Option<&NullValues>,
    parse_dates: bool
) -> PolarsResult<(Schema, usize, usize)>

Available on crate feature csv-file only.
Infer the schema of a CSV file by reading through the first n records of the file, with max_read_lines controlling the maximum number of records to read. If max_read_lines is None, the whole file is read to infer its schema.

Returns:
- the inferred schema
- the number of rows used for inference
- the number of bytes read
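
For reference, here is a minimal sketch of calling the function directly, in the spirit of the repository examples below. It assumes a placeholder file foo.csv, that get_reader_bytes is available from this same utils module (as the examples suggest), and argument values that simply mirror common reader defaults; none of these choices are mandated by the API.

use std::fs::File;

use polars_core::prelude::*;
use polars_io::csv::utils::{get_reader_bytes, infer_file_schema};

fn main() -> PolarsResult<()> {
    // `foo.csv` is a placeholder path for this sketch.
    let mut file = File::open("foo.csv")?;
    let reader_bytes = get_reader_bytes(&mut file)?;

    // Taken by `&mut` because inference may bump it past leading comment lines.
    let mut skip_rows = 0;

    let (schema, rows_used, bytes_read) = infer_file_schema(
        &reader_bytes,
        b',',       // delimiter
        Some(100),  // max_read_lines: inspect at most 100 records
        true,       // has_header
        None,       // schema_overwrite
        &mut skip_rows,
        0,          // skip_rows_after_header
        None,       // comment_char
        Some(b'"'), // quote_char
        b'\n',      // eol_char
        None,       // null_values
        false,      // parse_dates
    )?;

    println!("inferred {:?} from {} rows ({} bytes)", schema, rows_used, bytes_read);
    Ok(())
}

Because skip_rows is taken by mutable reference, a caller can feed the updated value into a subsequent parse so that it starts at the first non-comment line.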
Examples found in repository
src/csv/read.rs (lines 429-442)
pub fn batched(mut self, schema: Option<SchemaRef>) -> PolarsResult<OwnedBatchedCsvReader> {
    match schema {
        Some(schema) => Ok(to_batched_owned(self, schema)),
        None => {
            let reader_bytes = get_reader_bytes(&mut self.reader)?;

            let (inferred_schema, _, _) = infer_file_schema(
                &reader_bytes,
                self.delimiter.unwrap_or(b','),
                self.max_records,
                self.has_header,
                None,
                &mut self.skip_rows_before_header,
                self.skip_rows_after_header,
                self.comment_char,
                self.quote_char,
                self.eol_char,
                self.null_values.as_ref(),
                self.parse_dates,
            )?;
            let schema = Arc::new(inferred_schema);
            Ok(to_batched_owned(self, schema))
        }
    }
}

More examples
src/csv/read_impl/mod.rs (lines 218-231)
pub(crate) fn new(
    reader_bytes: ReaderBytes<'a>,
    n_rows: Option<usize>,
    mut skip_rows: usize,
    mut projection: Option<Vec<usize>>,
    max_records: Option<usize>,
    delimiter: Option<u8>,
    has_header: bool,
    ignore_parser_errors: bool,
    schema: Option<&'a Schema>,
    columns: Option<Vec<String>>,
    encoding: CsvEncoding,
    n_threads: Option<usize>,
    schema_overwrite: Option<&'a Schema>,
    dtype_overwrite: Option<&'a [DataType]>,
    sample_size: usize,
    chunk_size: usize,
    low_memory: bool,
    comment_char: Option<u8>,
    quote_char: Option<u8>,
    eol_char: u8,
    null_values: Option<NullValues>,
    predicate: Option<Arc<dyn PhysicalIoExpr>>,
    to_cast: Vec<Field>,
    skip_rows_after_header: usize,
    row_count: Option<RowCount>,
    parse_dates: bool,
) -> PolarsResult<CoreReader<'a>> {
    #[cfg(any(feature = "decompress", feature = "decompress-fast"))]
    let mut reader_bytes = reader_bytes;

    #[cfg(not(any(feature = "decompress", feature = "decompress-fast")))]
    if is_compressed(&reader_bytes) {
        return Err(PolarsError::ComputeError(
            "cannot read compressed csv file; compile with feature 'decompress' or 'decompress-fast'".into(),
        ));
    }

    // check if schema should be inferred
    let delimiter = delimiter.unwrap_or(b',');

    let mut schema = match schema {
        Some(schema) => Cow::Borrowed(schema),
        None => {
            {
                // We keep track of the inferred schema bool
                // In case the file is compressed this schema inference is wrong and has to be done
                // again after decompression.
                #[cfg(any(feature = "decompress", feature = "decompress-fast"))]
                if let Some(b) =
                    decompress(&reader_bytes, n_rows, delimiter, quote_char, eol_char)
                {
                    reader_bytes = ReaderBytes::Owned(b);
                }

                let (inferred_schema, _, _) = infer_file_schema(
                    &reader_bytes,
                    delimiter,
                    max_records,
                    has_header,
                    schema_overwrite,
                    &mut skip_rows,
                    skip_rows_after_header,
                    comment_char,
                    quote_char,
                    eol_char,
                    null_values.as_ref(),
                    parse_dates,
                )?;
                Cow::Owned(inferred_schema)
            }
        }
    };
    if let Some(dtypes) = dtype_overwrite {
        let mut s = schema.into_owned();
        for (index, dt) in dtypes.iter().enumerate() {
            s.coerce_by_index(index, dt.clone()).unwrap();
        }
        schema = Cow::Owned(s);
    }

    // create a null value for every column
    let mut null_values = null_values.map(|nv| nv.compile(&schema)).transpose()?;

    if let Some(cols) = columns {
        let mut prj = Vec::with_capacity(cols.len());
        for col in cols {
            let i = schema.try_index_of(&col)?;
            prj.push(i);
        }
        // update null values with projection
        if let Some(nv) = null_values.as_mut() {
            nv.apply_projection(&prj);
        }
        projection = Some(prj);
    }

    Ok(CoreReader {
        reader_bytes: Some(reader_bytes),
        schema,
        projection,
        line_number: usize::from(has_header),
        ignore_parser_errors,
        skip_rows_before_header: skip_rows,
        skip_rows_after_header,
        n_rows,
        encoding,
        n_threads,
        has_header,
        delimiter,
        sample_size,
        chunk_size,
        low_memory,
        comment_char,
        quote_char,
        eol_char,
        null_values,
        predicate,
        to_cast,
        row_count,
    })
}

src/csv/utils.rs (lines 285-298)
pub fn infer_file_schema(
    reader_bytes: &ReaderBytes,
    delimiter: u8,
    max_read_lines: Option<usize>,
    has_header: bool,
    schema_overwrite: Option<&Schema>,
    // we take &mut because we maybe need to skip more rows dependent
    // on the schema inference
    skip_rows: &mut usize,
    skip_rows_after_header: usize,
    comment_char: Option<u8>,
    quote_char: Option<u8>,
    eol_char: u8,
    null_values: Option<&NullValues>,
    parse_dates: bool,
) -> PolarsResult<(Schema, usize, usize)> {
    // keep track so that we can determine the amount of bytes read
    let start_ptr = reader_bytes.as_ptr() as usize;

    // We use lossy utf8 here because we don't want the schema inference to fail on utf8.
    // It may later.
    let encoding = CsvEncoding::LossyUtf8;

    let bytes = skip_line_ending(skip_bom(reader_bytes), eol_char);
    if bytes.is_empty() {
        return Err(PolarsError::NoData("empty csv".into()));
    }
    let mut lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
    // it can be that we have a single line without eol char
    let has_eol = bytes.contains(&eol_char);

    // get or create header names
    // when has_header is false, creates default column names with column_ prefix

    // skip lines that are comments
    let mut first_line = None;
    if let Some(comment_ch) = comment_char {
        for (i, line) in (&mut lines).enumerate() {
            if let Some(ch) = line.first() {
                if *ch != comment_ch {
                    first_line = Some(line);
                    *skip_rows += i;
                    break;
                }
            }
        }
    } else {
        first_line = lines.next();
    }

    // edge case where we have a single row, no header and no eol char.
    if first_line.is_none() && !has_eol && !has_header {
        first_line = Some(bytes);
    }

    // now that we've found the first non-comment line we parse the headers, or we create a header
    let headers: Vec<String> = if let Some(mut header_line) = first_line {
        let len = header_line.len();
        if len > 1 {
            // remove carriage return
            let trailing_byte = header_line[len - 1];
            if trailing_byte == b'\r' {
                header_line = &header_line[..len - 1];
            }
        }

        let byterecord = SplitFields::new(header_line, delimiter, quote_char, eol_char);
        if has_header {
            let headers = byterecord
                .map(|(slice, needs_escaping)| {
                    let slice_escaped = if needs_escaping && (slice.len() >= 2) {
                        &slice[1..(slice.len() - 1)]
                    } else {
                        slice
                    };
                    let s = parse_bytes_with_encoding(slice_escaped, encoding)?;
                    Ok(s)
                })
                .collect::<PolarsResult<Vec<_>>>()?;

            let mut final_headers = Vec::with_capacity(headers.len());

            let mut header_names = PlHashMap::with_capacity(headers.len());

            for name in &headers {
                let count = header_names.entry(name.as_ref()).or_insert(0usize);
                if *count != 0 {
                    final_headers.push(format!("{}_duplicated_{}", name, *count - 1))
                } else {
                    final_headers.push(name.to_string())
                }
                *count += 1;
            }
            final_headers
        } else {
            let mut column_names: Vec<String> = byterecord
                .enumerate()
                .map(|(i, _s)| format!("column_{}", i + 1))
                .collect();
            // needed because SplitLines does not return the \n char, so SplitFields does not catch
            // the latest value if ending with a delimiter.
            if header_line.ends_with(&[delimiter]) {
                column_names.push(format!("column_{}", column_names.len() + 1))
            }
            column_names
        }
    } else if has_header && !bytes.is_empty() {
        // there was no new line char. So we copy the whole buf and add one
        // this is likely to be cheap as there are no rows.
        let mut buf = Vec::with_capacity(bytes.len() + 2);
        buf.extend_from_slice(bytes);
        buf.push(eol_char);

        return infer_file_schema(
            &ReaderBytes::Owned(buf),
            delimiter,
            max_read_lines,
            has_header,
            schema_overwrite,
            skip_rows,
            skip_rows_after_header,
            comment_char,
            quote_char,
            eol_char,
            null_values,
            parse_dates,
        );
    } else {
        return Err(PolarsError::NoData("empty csv".into()));
    };
    if !has_header {
        // re-init lines so that the header is included in type inference.
        lines = SplitLines::new(bytes, quote_char.unwrap_or(b'"'), eol_char).skip(*skip_rows);
    }

    let header_length = headers.len();
    // keep track of inferred field types
    let mut column_types: Vec<PlHashSet<DataType>> =
        vec![PlHashSet::with_capacity(4); header_length];
    // keep track of columns with nulls
    let mut nulls: Vec<bool> = vec![false; header_length];

    let mut rows_count = 0;
    let mut fields = Vec::with_capacity(header_length);

    // needed to prevent ownership going into the iterator loop
    let records_ref = &mut lines;

    let mut end_ptr = start_ptr;
    for mut line in records_ref
        .take(max_read_lines.unwrap_or(usize::MAX))
        .skip(skip_rows_after_header)
    {
        rows_count += 1;
        // keep track so that we can determine the amount of bytes read
        end_ptr = line.as_ptr() as usize + line.len();

        if let Some(c) = comment_char {
            // line is a comment -> skip
            if line[0] == c {
                continue;
            }
        }

        let len = line.len();
        if len > 1 {
            // remove carriage return
            let trailing_byte = line[len - 1];
            if trailing_byte == b'\r' {
                line = &line[..len - 1];
            }
        }

        let mut record = SplitFields::new(line, delimiter, quote_char, eol_char);

        for i in 0..header_length {
            if let Some((slice, needs_escaping)) = record.next() {
                if slice.is_empty() {
                    nulls[i] = true;
                } else {
                    let slice_escaped = if needs_escaping && (slice.len() >= 2) {
                        &slice[1..(slice.len() - 1)]
                    } else {
                        slice
                    };
                    let s = parse_bytes_with_encoding(slice_escaped, encoding)?;
                    match &null_values {
                        None => {
                            column_types[i].insert(infer_field_schema(&s, parse_dates));
                        }
                        Some(NullValues::AllColumns(names)) => {
                            if !names.iter().any(|nv| nv == s.as_ref()) {
                                column_types[i].insert(infer_field_schema(&s, parse_dates));
                            }
                        }
                        Some(NullValues::AllColumnsSingle(name)) => {
                            if s.as_ref() != name {
                                column_types[i].insert(infer_field_schema(&s, parse_dates));
                            }
                        }
                        Some(NullValues::Named(names)) => {
                            let current_name = &headers[i];
                            let null_name = &names.iter().find(|name| &name.0 == current_name);

                            if let Some(null_name) = null_name {
                                if null_name.1 != s.as_ref() {
                                    column_types[i].insert(infer_field_schema(&s, parse_dates));
                                }
                            } else {
                                column_types[i].insert(infer_field_schema(&s, parse_dates));
                            }
                        }
                    }
                }
            }
        }
    }

    // build schema from inference results
    for i in 0..header_length {
        let possibilities = &column_types[i];
        let field_name = &headers[i];

        if let Some(schema_overwrite) = schema_overwrite {
            if let Some((_, name, dtype)) = schema_overwrite.get_full(field_name) {
                fields.push(Field::new(name, dtype.clone()));
                continue;
            }
        }

        // determine data type based on possible types
        // if there are incompatible types, use DataType::Utf8
        match possibilities.len() {
            1 => {
                for dtype in possibilities.iter() {
                    fields.push(Field::new(field_name, dtype.clone()));
                }
            }
            2 => {
                if possibilities.contains(&DataType::Int64)
                    && possibilities.contains(&DataType::Float64)
                {
                    // we have an integer and double, fall down to double
                    fields.push(Field::new(field_name, DataType::Float64));
                }
                // prefer a datelike parse above a no parse so choose the date type
                else if possibilities.contains(&DataType::Utf8)
                    && possibilities.contains(&DataType::Date)
                {
                    fields.push(Field::new(field_name, DataType::Date));
                }
                // prefer a datelike parse above a no parse so choose the date type
                else if possibilities.contains(&DataType::Utf8)
                    && possibilities.contains(&DataType::Datetime(TimeUnit::Microseconds, None))
                {
                    fields.push(Field::new(
                        field_name,
                        DataType::Datetime(TimeUnit::Microseconds, None),
                    ));
                } else {
                    // default to Utf8 for conflicting datatypes (e.g bool and int)
                    fields.push(Field::new(field_name, DataType::Utf8));
                }
            }
            _ => fields.push(Field::new(field_name, DataType::Utf8)),
        }
    }

    // if there is a single line after the header without an eol
    // we copy the bytes add an eol and rerun this function
    // so that the inference is consistent with and without eol char
    if rows_count == 0 && reader_bytes[reader_bytes.len() - 1] != eol_char {
        let mut rb = Vec::with_capacity(reader_bytes.len() + 1);
        rb.extend_from_slice(reader_bytes);
        rb.push(eol_char);
        return infer_file_schema(
            &ReaderBytes::Owned(rb),
            delimiter,
            max_read_lines,
            has_header,
            schema_overwrite,
            skip_rows,
            skip_rows_after_header,
            comment_char,
            quote_char,
            eol_char,
            null_values,
            parse_dates,
        );
    }

    Ok((
        Schema::from(fields.into_iter()),
        rows_count,
        end_ptr - start_ptr,
    ))
}