qdrant_datafusion/
utils.rs

1//! Various utility functions for working with schema and data.
2
3use std::collections::HashSet;
4
5use datafusion::arrow::datatypes::Schema;
6
/// Describes which vectors should be requested from `Qdrant` for a query.
///
/// The value is derived from the (possibly projected) `DataFusion` schema so that
/// only the vector data actually referenced by the query is fetched.
///
/// Equality is derived, so selectors can be compared directly (for example with
/// `assert_eq!` in tests). For [`VectorSelectorSpec::Named`] the comparison is
/// order-sensitive because the names are stored in a `Vec`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum VectorSelectorSpec {
    /// No vectors needed - only metadata fields (id, payload) were requested.
    None,
    /// All vectors needed - either an unnamed-vector collection or every vector is wanted.
    All,
    /// Only the listed named vectors are needed; each entry is a vector name.
    Named(Vec<String>),
}
20
21/// Build an optimal vector selector based on the projected schema.
22///
23/// This function analyzes the `DataFusion` schema (after projection) to determine
24/// which vector fields are actually needed, enabling efficient queries that only
25/// fetch required data from `Qdrant`.
26///
27/// # Arguments
28/// * `schema` - The Arrow schema (potentially projected) defining which fields are needed
29///
30/// # Returns
31/// A `VectorSelectorSpec` that tells `Qdrant` exactly which vectors to include in the response.
32///
33/// # Examples
34/// ```rust,ignore
35/// use datafusion::arrow::datatypes::{Schema, Field, DataType};
36/// use qdrant_datafusion::utils::{build_vector_selector, VectorSelectorSpec};
37/// use std::sync::Arc;
38///
39/// // Schema with only metadata - no vectors needed
40/// let metadata_schema = Schema::new(vec![
41///     Field::new("id", DataType::Utf8, false),
42///     Field::new("payload", DataType::Utf8, true),
43/// ]);
44/// assert!(matches!(build_vector_selector(&metadata_schema), VectorSelectorSpec::None));
45///
46/// // Schema with unnamed vector - fetch all
47/// let unnamed_schema = Schema::new(vec![
48///     Field::new("id", DataType::Utf8, false),
49///     Field::new("vector", DataType::List(Arc::new(Field::new("item", DataType::Float32, true))), true),
50/// ]);
51/// assert!(matches!(build_vector_selector(&unnamed_schema), VectorSelectorSpec::All));
52/// ```
53pub fn build_vector_selector(schema: &Schema) -> VectorSelectorSpec {
54    let mut vector_names: HashSet<_> = schema
55        .fields()
56        .iter()
57        .filter(|f| !["id", "payload"].contains(&f.name().as_str()))
58        .map(|f| f.name())
59        .map(|name| {
60            if name.ends_with("_indices") || name.ends_with("_values") {
61                // Extract base name for sparse vectors
62                name.trim_end_matches("_indices").trim_end_matches("_values")
63            } else {
64                name
65            }
66        })
67        .collect();
68
69    if vector_names.is_empty() {
70        return VectorSelectorSpec::None;
71    }
72
73    let has_unnamed_vector = vector_names.remove("vector");
74    if has_unnamed_vector || vector_names.is_empty() {
75        // Unnamed vector collection - use simple boolean selector
76        VectorSelectorSpec::All
77    } else {
78        // Specific named vectors
79        VectorSelectorSpec::Named(
80            vector_names.into_iter().map(ToString::to_string).collect::<Vec<_>>(),
81        )
82    }
83}
84
85/// Determine if payload data should be included in `Qdrant` queries.
86///
87/// This function checks if the schema includes a "payload" field, which indicates
88/// that payload data is needed and should be fetched from `Qdrant`.
89///
90/// # Arguments
91/// * `schema` - The Arrow schema to analyze
92///
93/// # Returns
94/// `true` if payload field is present in schema, `false` otherwise
95///
96/// # Examples
97/// ```rust,ignore
98/// use datafusion::arrow::datatypes::{Schema, Field, DataType};
99/// use qdrant_datafusion::utils::build_payload_selector;
100///
101/// // Schema with payload field
102/// let with_payload = Schema::new(vec![
103///     Field::new("id", DataType::Utf8, false),
104///     Field::new("payload", DataType::Utf8, true),
105/// ]);
106/// assert_eq!(build_payload_selector(&with_payload), true);
107///
108/// // Schema without payload
109/// let no_payload = Schema::new(vec![
110///     Field::new("id", DataType::Utf8, false),
111/// ]);
112/// assert_eq!(build_payload_selector(&no_payload), false);
113/// ```
114pub fn build_payload_selector(schema: &Schema) -> bool {
115    schema.fields().iter().any(|f| f.name() == "payload")
116}