exon_sdf/
schema_builder.rs

1// Copyright 2024 WHERE TRUE Technologies.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use arrow::datatypes::{Field, Schema};
18use exon_common::TableSchema;
19
20use crate::record::Data;
21
/// Builds a schema for an SDF file.
///
/// Fields are collected in two groups: the fields read from the file itself
/// and the partition fields, which `build` appends after the file fields.
pub struct SDFSchemaBuilder {
    /// Fields describing the file's own columns, in column order.
    file_fields: Vec<Field>,
    /// Partition fields; `build` places these after all file fields.
    partition_fields: Vec<Field>,
}
27
28impl Default for SDFSchemaBuilder {
29    fn default() -> Self {
30        // by default, data is a struct with a single field, which is a string called "canonical_smiles"
31        let data_fields = vec![Field::new(
32            "canonical_smiles",
33            arrow::datatypes::DataType::Utf8,
34            false,
35        )];
36        let struct_type = arrow::datatypes::DataType::Struct(data_fields.into());
37
38        let file_fields = vec![
39            // header which is a string
40            Field::new("header", arrow::datatypes::DataType::Utf8, false),
41            // atom count which is a 32-bit unsigned integer
42            Field::new("atom_count", arrow::datatypes::DataType::UInt32, false),
43            // bond count which is a 32-bit unsigned integer
44            Field::new("bond_count", arrow::datatypes::DataType::UInt32, false),
45            // data which is a struct with a single field, which is a string called "canonical_smiles"
46            Field::new("data", struct_type, false),
47        ];
48
49        Self {
50            file_fields,
51            partition_fields: Vec::new(),
52        }
53    }
54}
55
56impl SDFSchemaBuilder {
57    /// Creates a new schema builder.
58    pub fn new() -> Self {
59        SDFSchemaBuilder {
60            file_fields: Vec::new(),
61            partition_fields: Vec::new(),
62        }
63    }
64
65    /// Adds a field to the schema.
66    pub fn add_field(&mut self, field: Field) {
67        self.file_fields.push(field);
68    }
69
70    /// Adds a partition field to the schema.
71    pub fn add_partition_field(&mut self, field: Field) {
72        self.partition_fields.push(field);
73    }
74
75    /// Update the data field based on the input data.
76    pub fn update_data_field(&mut self, data: &Data) {
77        let new_fields = data
78            .into_iter()
79            .map(|d| Field::new(d.header(), arrow::datatypes::DataType::Utf8, true))
80            .collect::<Vec<_>>();
81
82        let struct_type = arrow::datatypes::DataType::Struct(new_fields.into());
83        self.file_fields[3] = Field::new("data", struct_type, false);
84    }
85
86    /// Builds the schema.
87    pub fn build(self) -> TableSchema {
88        let mut fields = self.file_fields.clone();
89        fields.extend_from_slice(&self.partition_fields);
90
91        let schema = Schema::new(fields);
92
93        let projection = (0..self.file_fields.len()).collect::<Vec<_>>();
94
95        TableSchema::new(Arc::new(schema), projection)
96    }
97}