Skip to main content

datafusion_spark/function/string/
soundex.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray};
19use arrow::datatypes::DataType;
20use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
21use datafusion_common::utils::take_function_args;
22use datafusion_common::{Result, exec_err};
23use datafusion_expr::{ColumnarValue, Signature, Volatility};
24use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl};
25use datafusion_functions::utils::make_scalar_function;
26use std::sync::Arc;
27
/// Spark-compatible `soundex` expression
/// <https://spark.apache.org/docs/latest/api/sql/index.html#soundex>
///
/// The type only carries the function's [`Signature`]; the encoding itself
/// lives in the free functions of this file.
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct SparkSoundex {
    /// Argument signature handed back by `ScalarUDFImpl::signature`.
    signature: Signature,
}
34
35impl Default for SparkSoundex {
36    fn default() -> Self {
37        Self::new()
38    }
39}
40
41impl SparkSoundex {
42    pub fn new() -> Self {
43        Self {
44            signature: Signature::string(1, Volatility::Immutable),
45        }
46    }
47}
48
49impl ScalarUDFImpl for SparkSoundex {
50    fn name(&self) -> &str {
51        "soundex"
52    }
53
54    fn signature(&self) -> &Signature {
55        &self.signature
56    }
57
58    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
59        match &arg_types[0] {
60            DataType::LargeUtf8 => Ok(DataType::LargeUtf8),
61            _ => Ok(DataType::Utf8),
62        }
63    }
64
65    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
66        make_scalar_function(spark_soundex_inner, vec![])(&args.args)
67    }
68}
69
70fn spark_soundex_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
71    let [array] = take_function_args("soundex", arg)?;
72    match &array.data_type() {
73        DataType::Utf8 => soundex_array::<i32>(array),
74        DataType::LargeUtf8 => soundex_array::<i64>(array),
75        DataType::Utf8View => soundex_view(array),
76        other => {
77            exec_err!("unsupported data type {other:?} for function `soundex`")
78        }
79    }
80}
81
82fn soundex_array<T: OffsetSizeTrait>(array: &ArrayRef) -> Result<ArrayRef> {
83    let str_array = as_generic_string_array::<T>(array)?;
84    let result = str_array
85        .iter()
86        .map(|s| s.map(compute_soundex))
87        .collect::<StringArray>();
88    Ok(Arc::new(result))
89}
90
91fn soundex_view(str_view: &ArrayRef) -> Result<ArrayRef> {
92    let str_array = as_string_view_array(str_view)?;
93    let result = str_array
94        .iter()
95        .map(|opt_str| opt_str.map(compute_soundex))
96        .collect::<StringArray>();
97    Ok(Arc::new(result) as ArrayRef)
98}
99
/// Returns the Soundex digit for `c` (case-insensitive), or `None` when `c`
/// has no digit: vowels, `H`, `W`, `Y` and every non-ASCII-letter character.
fn classify_char(c: char) -> Option<char> {
    // One entry per letter `A..=Z`; `0` marks letters without a digit.
    const DIGITS: &[u8; 26] = b"01230120022455012623010202";

    if !c.is_ascii_alphabetic() {
        return None;
    }
    // `c` is an ASCII letter here, so narrowing to `u8` is lossless.
    let slot = usize::from(c.to_ascii_uppercase() as u8 - b'A');
    match DIGITS[slot] {
        b'0' => None,
        digit => Some(char::from(digit)),
    }
}
111
/// True for `H` and `W` in either case — the letters `compute_soundex`
/// skips without touching the previously seen code.
fn is_ignored(c: char) -> bool {
    c.eq_ignore_ascii_case(&'H') || c.eq_ignore_ascii_case(&'W')
}
115
116fn compute_soundex(s: &str) -> String {
117    let mut chars = s.chars();
118
119    let first_char = match chars.next() {
120        Some(c) if c.is_ascii_alphabetic() => c.to_ascii_uppercase(),
121        _ => return s.to_string(),
122    };
123
124    let mut soundex_code = String::with_capacity(4);
125    soundex_code.push(first_char);
126    let mut last_code = classify_char(first_char);
127
128    for c in chars {
129        if soundex_code.len() >= 4 {
130            break;
131        }
132
133        if is_ignored(c) {
134            continue;
135        }
136
137        match classify_char(c) {
138            Some(code) => {
139                if last_code != Some(code) {
140                    soundex_code.push(code);
141                }
142                last_code = Some(code);
143            }
144            None => {
145                last_code = None;
146            }
147        }
148    }
149    format!("{soundex_code:0<4}")
150}