qdrant_datafusion/utils.rs
1//! Various utility functions for working with schema and data.
2
3use std::collections::HashSet;
4
5use datafusion::arrow::datatypes::Schema;
6
/// Specification for selecting which vectors to retrieve from `Qdrant`.
///
/// This enum determines what vector data will be requested from `Qdrant` during query
/// execution. The selection is derived from the (possibly projected) `DataFusion` schema,
/// so that only the vector data a query actually reads is requested.
///
/// Values compare structurally: two `Named` selectors are equal only when they hold the
/// same names in the same order.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum VectorSelectorSpec {
    /// No vectors needed - only metadata fields (id, payload) requested
    None,
    /// All vectors needed - either unnamed collection or all named vectors requested
    All,
    /// Specific named vectors needed - only fetch these vector fields
    Named(Vec<String>),
}
20
21/// Build an optimal vector selector based on the projected schema.
22///
23/// This function analyzes the `DataFusion` schema (after projection) to determine
24/// which vector fields are actually needed, enabling efficient queries that only
25/// fetch required data from `Qdrant`.
26///
27/// # Arguments
28/// * `schema` - The Arrow schema (potentially projected) defining which fields are needed
29///
30/// # Returns
31/// A `VectorSelectorSpec` that tells `Qdrant` exactly which vectors to include in the response.
32///
33/// # Examples
34/// ```rust,ignore
35/// use datafusion::arrow::datatypes::{Schema, Field, DataType};
36/// use qdrant_datafusion::utils::{build_vector_selector, VectorSelectorSpec};
37/// use std::sync::Arc;
38///
39/// // Schema with only metadata - no vectors needed
40/// let metadata_schema = Schema::new(vec![
41/// Field::new("id", DataType::Utf8, false),
42/// Field::new("payload", DataType::Utf8, true),
43/// ]);
44/// assert!(matches!(build_vector_selector(&metadata_schema), VectorSelectorSpec::None));
45///
46/// // Schema with unnamed vector - fetch all
47/// let unnamed_schema = Schema::new(vec![
48/// Field::new("id", DataType::Utf8, false),
49/// Field::new("vector", DataType::List(Arc::new(Field::new("item", DataType::Float32, true))), true),
50/// ]);
51/// assert!(matches!(build_vector_selector(&unnamed_schema), VectorSelectorSpec::All));
52/// ```
53pub fn build_vector_selector(schema: &Schema) -> VectorSelectorSpec {
54 let mut vector_names: HashSet<_> = schema
55 .fields()
56 .iter()
57 .filter(|f| !["id", "payload"].contains(&f.name().as_str()))
58 .map(|f| f.name())
59 .map(|name| {
60 if name.ends_with("_indices") || name.ends_with("_values") {
61 // Extract base name for sparse vectors
62 name.trim_end_matches("_indices").trim_end_matches("_values")
63 } else {
64 name
65 }
66 })
67 .collect();
68
69 if vector_names.is_empty() {
70 return VectorSelectorSpec::None;
71 }
72
73 let has_unnamed_vector = vector_names.remove("vector");
74 if has_unnamed_vector || vector_names.is_empty() {
75 // Unnamed vector collection - use simple boolean selector
76 VectorSelectorSpec::All
77 } else {
78 // Specific named vectors
79 VectorSelectorSpec::Named(
80 vector_names.into_iter().map(ToString::to_string).collect::<Vec<_>>(),
81 )
82 }
83}
84
85/// Determine if payload data should be included in `Qdrant` queries.
86///
87/// This function checks if the schema includes a "payload" field, which indicates
88/// that payload data is needed and should be fetched from `Qdrant`.
89///
90/// # Arguments
91/// * `schema` - The Arrow schema to analyze
92///
93/// # Returns
94/// `true` if payload field is present in schema, `false` otherwise
95///
96/// # Examples
97/// ```rust,ignore
98/// use datafusion::arrow::datatypes::{Schema, Field, DataType};
99/// use qdrant_datafusion::utils::build_payload_selector;
100///
101/// // Schema with payload field
102/// let with_payload = Schema::new(vec![
103/// Field::new("id", DataType::Utf8, false),
104/// Field::new("payload", DataType::Utf8, true),
105/// ]);
106/// assert_eq!(build_payload_selector(&with_payload), true);
107///
108/// // Schema without payload
109/// let no_payload = Schema::new(vec![
110/// Field::new("id", DataType::Utf8, false),
111/// ]);
112/// assert_eq!(build_payload_selector(&no_payload), false);
113/// ```
114pub fn build_payload_selector(schema: &Schema) -> bool {
115 schema.fields().iter().any(|f| f.name() == "payload")
116}