Skip to main content

polars_python/functions/
io.rs

1use std::io::BufReader;
2
3#[cfg(any(feature = "ipc", feature = "parquet"))]
4use polars::prelude::ArrowSchema;
5use polars::prelude::CloudScheme;
6use pyo3::prelude::*;
7use pyo3::types::PyDict;
8
9use crate::conversion::Wrap;
10use crate::error::PyPolarsErr;
11use crate::file::{EitherRustPythonFile, get_either_file};
12use crate::io::cloud_options::OptPyCloudOptions;
13
// Reads the schema of an Arrow IPC file and returns it as a Python dict that
// maps each field name to its (wrapped) polars data type.
//
// `py_f` is resolved via `get_either_file`; a Rust-side file is wrapped in a
// `BufReader`, a Python-side file object is read from directly.
#[cfg(feature = "ipc")]
#[pyfunction]
pub fn read_ipc_schema(py: Python<'_>, py_f: Py<PyAny>) -> PyResult<Bound<'_, PyDict>> {
    use arrow::io::ipc::read::read_file_metadata;

    let source = get_either_file(py_f, false)?;
    // Both arms go through the same reader function, so the error mapping is
    // applied once on the combined result.
    let parsed = match source {
        EitherRustPythonFile::Py(mut py_file) => read_file_metadata(&mut py_file),
        EitherRustPythonFile::Rust(rust_file) => {
            let mut buffered = BufReader::new(rust_file);
            read_file_metadata(&mut buffered)
        },
    };
    let metadata = parsed.map_err(PyPolarsErr::from)?;

    let schema_dict = PyDict::new(py);
    fields_to_pydict(&metadata.schema, &schema_dict)?;
    Ok(schema_dict)
}
29
// Reads the custom key-value metadata from a parquet footer and returns it as
// a Python dict of string keys to string values.
//
// `py_f` is resolved via `get_python_scan_source_input` into one of:
// - an in-memory buffer (read through a `Cursor`),
// - a file handle (read through a `BufReader`),
// - a path: paths with a scheme go through `ParquetObjectStore` (gated on the
//   `cloud` feature), all other paths are opened with `polars_utils::open_file`.
#[cfg(feature = "parquet")]
#[pyfunction]
pub fn read_parquet_metadata(
    py: Python,
    py_f: Py<PyAny>,
    storage_options: OptPyCloudOptions,
    credential_provider: Option<Py<PyAny>>,
) -> PyResult<Py<PyDict>> {
    use std::io::Cursor;

    use polars_error::feature_gated;
    use polars_parquet::read::read_metadata;
    use polars_parquet::read::schema::read_custom_key_value_metadata;

    use crate::file::{PythonScanSourceInput, get_python_scan_source_input};

    let source = get_python_scan_source_input(py_f, false)?;
    let footer = match source {
        PythonScanSourceInput::File(handle) => {
            let mut buffered = BufReader::new(handle);
            read_metadata(&mut buffered).map_err(PyPolarsErr::from)?
        },
        PythonScanSourceInput::Buffer(bytes) => {
            let mut cursor = Cursor::new(bytes);
            read_metadata(&mut cursor).map_err(PyPolarsErr::from)?
        },
        PythonScanSourceInput::Path(location) => {
            // Cloud options are resolved for every path, before deciding
            // whether the path is local or remote.
            let scheme = CloudScheme::from_path(location.as_str());
            let cloud_options =
                storage_options.extract_opt_cloud_options(scheme, credential_provider)?;

            if !location.has_scheme() {
                let local =
                    polars_utils::open_file(location.as_std_path()).map_err(PyPolarsErr::from)?;
                let mut buffered = BufReader::new(local);
                read_metadata(&mut buffered).map_err(PyPolarsErr::from)?
            } else {
                feature_gated!("cloud", {
                    use polars::prelude::ParquetObjectStore;
                    use polars_error::PolarsResult;

                    // The remote fetch runs on the shared async runtime,
                    // wrapped in `py.detach`.
                    py.detach(|| {
                        polars_core::runtime::ASYNC.block_on(async {
                            let mut store = ParquetObjectStore::from_uri(
                                location,
                                cloud_options.as_ref(),
                                None,
                            )
                            .await?;
                            let fetched = store.get_metadata().await?;
                            // Clone the metadata out so an owned value is returned.
                            PolarsResult::Ok((**fetched).clone())
                        })
                    })
                })
                .map_err(PyPolarsErr::from)?
            }
        },
    };

    let custom = read_custom_key_value_metadata(footer.key_value_metadata());
    let out = PyDict::new(py);
    custom
        .into_iter()
        .try_for_each(|(key, value)| out.set_item(key.as_str(), value.as_str()))?;
    Ok(out.unbind())
}
89
/// Measure the serialized size of a parquet footer.
///
/// Reads the footer of the local file at `path`, optionally applies
/// `FileMetadata::pruned`, serializes the result with `pl_serialize`, and
/// returns the number of bytes produced.
///
/// Exposed for out-of-tree measurement of the IR-plan-borne metadata wire
/// form (the `bincode(FileMetadata)` blob shipped to workers in distributed
/// execution); no caller in py-polars itself.
///
/// * `projection = None`: the full `FileMetadata` is encoded and `predicate`
///   is not used.
/// * `projection = Some(cols)`: `pruned(cols, predicate)` is applied first.
///
/// Local files only.
#[cfg(all(feature = "parquet", feature = "json"))]
#[pyfunction]
pub fn _bench_parquet_metadata_bincode_size(
    path: &str,
    projection: Option<Vec<String>>,
    predicate: Vec<String>,
) -> PyResult<usize> {
    use polars_parquet::read::read_metadata;
    use polars_utils::pl_serialize;
    use polars_utils::pl_str::PlSmallStr;

    let footer = {
        let handle = std::fs::File::open(path).map_err(|e| PyPolarsErr::Other(e.to_string()))?;
        read_metadata(&mut BufReader::new(handle)).map_err(PyPolarsErr::from)?
    };

    // Match the IR-plan serializer's framing format.
    let encoded_len = if let Some(columns) = projection {
        let to_small = |names: Vec<String>| -> Vec<PlSmallStr> {
            names.into_iter().map(PlSmallStr::from).collect()
        };
        let keep = to_small(columns);
        let stats = to_small(predicate);
        let trimmed = footer
            .pruned(&keep, &stats)
            .map_err(|e| PyPolarsErr::Other(e.to_string()))?;
        pl_serialize::serialize_to_bytes::<_, false>(&trimmed)
            .map_err(PyPolarsErr::from)?
            .len()
    } else {
        pl_serialize::serialize_to_bytes::<_, false>(&footer)
            .map_err(PyPolarsErr::from)?
            .len()
    };
    Ok(encoded_len)
}
129
/// Return the pruned footer of a parquet file as a JSON string.
///
/// Reads the footer of the local file at `path`, applies
/// `FileMetadata::pruned(projection, predicate)`, and serializes the result
/// with `serde_json`. Format-agnostic custom serde lets the same wire DTOs
/// emit JSON for inspection or bincode for dispatch.
///
/// Used by py-polars tests to assert structural prune behavior (only kept
/// columns survive, stats only on predicate columns). Local files only.
#[cfg(all(feature = "parquet", feature = "json"))]
#[pyfunction]
pub fn _parquet_metadata_pruned_json(
    path: &str,
    projection: Vec<String>,
    predicate: Vec<String>,
) -> PyResult<String> {
    use polars_parquet::read::read_metadata;
    use polars_utils::pl_str::PlSmallStr;

    let footer = {
        let handle = std::fs::File::open(path).map_err(|e| PyPolarsErr::Other(e.to_string()))?;
        read_metadata(&mut BufReader::new(handle)).map_err(PyPolarsErr::from)?
    };

    let to_small = |names: Vec<String>| -> Vec<PlSmallStr> {
        names.into_iter().map(PlSmallStr::from).collect()
    };
    let kept_columns = to_small(projection);
    let stat_columns = to_small(predicate);

    let trimmed = footer
        .pruned(&kept_columns, &stat_columns)
        .map_err(|e| PyPolarsErr::Other(e.to_string()))?;

    let json = serde_json::to_string(&trimmed).map_err(|e| PyPolarsErr::Other(e.to_string()))?;
    Ok(json)
}
157
/// Fill `dict` with one entry per field of `schema`: the field name mapped to
/// the polars data type derived from the Arrow field (wrapped in `Wrap`).
///
/// Stops at, and returns, the first error raised while inserting into `dict`.
#[cfg(any(feature = "ipc", feature = "parquet"))]
fn fields_to_pydict(schema: &ArrowSchema, dict: &Bound<'_, PyDict>) -> PyResult<()> {
    schema.iter_values().try_for_each(|field| {
        let dtype = polars::prelude::DataType::from_arrow_field(field);
        dict.set_item(field.name.as_str(), &Wrap(dtype))
    })
}
166
// Returns the current text content of the system clipboard.
//
// Any `arboard` failure (opening the clipboard or fetching the text) is
// surfaced as `PyPolarsErr::Other` carrying the error's display text.
#[cfg(feature = "clipboard")]
#[pyfunction]
pub fn read_clipboard_string() -> PyResult<String> {
    use arboard::Clipboard;

    let mut board = Clipboard::new().map_err(|err| PyPolarsErr::Other(err.to_string()))?;
    let fetched = board.get_text();
    let text = fetched.map_err(|err| PyPolarsErr::Other(err.to_string()))?;
    Ok(text)
}
178
// Replaces the system clipboard content with the text `s`.
//
// Any `arboard` failure (opening the clipboard or storing the text) is
// surfaced as `PyPolarsErr::Other` carrying the error's display text.
#[cfg(feature = "clipboard")]
#[pyfunction]
pub fn write_clipboard_string(s: &str) -> PyResult<()> {
    use arboard::Clipboard;

    let mut board = Clipboard::new().map_err(|err| PyPolarsErr::Other(err.to_string()))?;
    let stored = board.set_text(s);
    stored.map_err(|err| PyPolarsErr::Other(err.to_string()))?;
    Ok(())
}