Skip to main content

polars_python/io/
scan_options.rs

1use std::sync::Arc;
2
3use polars::prelude::default_values::DefaultFieldValues;
4use polars::prelude::deletion::DeletionFilesList;
5use polars::prelude::{
6    CastColumnsPolicy, CloudScheme, ColumnMapping, ExtraColumnsPolicy, MissingColumnsPolicy,
7    PlSmallStr, Schema, TableStatistics, UnifiedScanArgs,
8};
9use polars_io::{HiveOptions, RowIndex};
10use polars_utils::IdxSize;
11use polars_utils::slice_enum::Slice;
12use pyo3::intern;
13use pyo3::prelude::*;
14use pyo3::pybacked::PyBackedStr;
15
16use crate::PyDataFrame;
17use crate::io::cloud_options::OptPyCloudOptions;
18use crate::prelude::Wrap;
19
/// Interface to `class ScanOptions` on the Python side.
///
/// This is a thin newtype around the bound Python object. Constructing it via `FromPyObject`
/// only stores the object; the individual option attributes are read from it later, when
/// `extract_unified_scan_args` is called.
pub struct PyScanOptions<'py>(Bound<'py, PyAny>);
22
23impl<'a, 'py> FromPyObject<'a, 'py> for PyScanOptions<'py> {
24    type Error = PyErr;
25
26    fn extract(ob: Borrowed<'a, 'py, PyAny>) -> PyResult<Self> {
27        Ok(Self(ob.to_owned()))
28    }
29}
30
31impl<'a, 'py> FromPyObject<'a, 'py> for Wrap<TableStatistics> {
32    type Error = PyErr;
33
34    fn extract(ob: Borrowed<'a, 'py, PyAny>) -> PyResult<Self> {
35        let py = ob.py();
36        let attr = ob.getattr(intern!(py, "_df"))?;
37        Ok(Wrap(TableStatistics(Arc::new(
38            PyDataFrame::extract(attr.as_borrowed())?.df.into_inner(),
39        ))))
40    }
41}
42
43impl PyScanOptions<'_> {
44    pub fn extract_unified_scan_args(
45        &self,
46        cloud_scheme: Option<CloudScheme>,
47    ) -> PyResult<UnifiedScanArgs> {
48        #[derive(FromPyObject)]
49        struct Extract<'a> {
50            row_index: Option<(Wrap<PlSmallStr>, IdxSize)>,
51            pre_slice: Option<(i64, usize)>,
52            cast_options: Wrap<CastColumnsPolicy>,
53            extra_columns: Wrap<ExtraColumnsPolicy>,
54            missing_columns: Wrap<MissingColumnsPolicy>,
55            include_file_paths: Option<Wrap<PlSmallStr>>,
56            glob: bool,
57            hidden_file_prefix: Option<Vec<PyBackedStr>>,
58            column_mapping: Option<Wrap<ColumnMapping>>,
59            default_values: Option<Wrap<DefaultFieldValues>>,
60            hive_partitioning: Option<bool>,
61            hive_schema: Option<Wrap<Schema>>,
62            try_parse_hive_dates: bool,
63            rechunk: bool,
64            cache: bool,
65            storage_options: OptPyCloudOptions<'a>,
66            credential_provider: Option<Py<PyAny>>,
67            deletion_files: Option<Wrap<DeletionFilesList>>,
68            table_statistics: Option<Wrap<TableStatistics>>,
69            row_count: Option<(u64, u64)>,
70        }
71
72        let Extract {
73            row_index,
74            pre_slice,
75            cast_options,
76            extra_columns,
77            missing_columns,
78            include_file_paths,
79            column_mapping,
80            default_values,
81            glob,
82            hidden_file_prefix,
83            hive_partitioning,
84            hive_schema,
85            try_parse_hive_dates,
86            rechunk,
87            cache,
88            storage_options,
89            credential_provider,
90            deletion_files,
91            table_statistics,
92            row_count,
93        } = self.0.extract()?;
94
95        let cloud_options =
96            storage_options.extract_opt_cloud_options(cloud_scheme, credential_provider)?;
97
98        let hive_schema = hive_schema.map(|s| Arc::new(s.0));
99
100        let row_index = row_index.map(|(name, offset)| RowIndex {
101            name: name.0,
102            offset,
103        });
104
105        let hive_options = HiveOptions {
106            enabled: hive_partitioning,
107            hive_start_idx: 0,
108            schema: hive_schema,
109            try_parse_dates: try_parse_hive_dates,
110        };
111
112        let unified_scan_args = UnifiedScanArgs {
113            // Schema is currently still stored inside the options per scan type, but we do eventually
114            // want to put it here instead.
115            schema: None,
116            cloud_options,
117            hive_options,
118            rechunk,
119            cache,
120            glob,
121            hidden_file_prefix: hidden_file_prefix
122                .map(|x| x.into_iter().map(|x| (*x).into()).collect()),
123            projection: None,
124            column_mapping: column_mapping.map(|x| x.0),
125            default_values: default_values
126                .map(|x| x.0)
127                .filter(|DefaultFieldValues::Iceberg(v)| !v.is_empty()),
128            row_index,
129            pre_slice: pre_slice.map(Slice::from),
130            cast_columns_policy: cast_options.0,
131            missing_columns_policy: missing_columns.0,
132            extra_columns_policy: extra_columns.0,
133            include_file_paths: include_file_paths.map(|x| x.0),
134            deletion_files: DeletionFilesList::filter_empty(deletion_files.map(|x| x.0)),
135            table_statistics: table_statistics.map(|x| x.0),
136            row_count,
137        };
138
139        Ok(unified_scan_args)
140    }
141}