datafusion_spark/function/string/
soundex.rs1use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray};
19use arrow::datatypes::DataType;
20use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
21use datafusion_common::utils::take_function_args;
22use datafusion_common::{Result, exec_err};
23use datafusion_expr::{ColumnarValue, Signature, Volatility};
24use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl};
25use datafusion_functions::utils::make_scalar_function;
26use std::sync::Arc;
27
28#[derive(Debug, PartialEq, Eq, Hash)]
31pub struct SparkSoundex {
32 signature: Signature,
33}
34
35impl Default for SparkSoundex {
36 fn default() -> Self {
37 Self::new()
38 }
39}
40
41impl SparkSoundex {
42 pub fn new() -> Self {
43 Self {
44 signature: Signature::string(1, Volatility::Immutable),
45 }
46 }
47}
48
49impl ScalarUDFImpl for SparkSoundex {
50 fn name(&self) -> &str {
51 "soundex"
52 }
53
54 fn signature(&self) -> &Signature {
55 &self.signature
56 }
57
58 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
59 match &arg_types[0] {
60 DataType::LargeUtf8 => Ok(DataType::LargeUtf8),
61 _ => Ok(DataType::Utf8),
62 }
63 }
64
65 fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
66 make_scalar_function(spark_soundex_inner, vec![])(&args.args)
67 }
68}
69
70fn spark_soundex_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
71 let [array] = take_function_args("soundex", arg)?;
72 match &array.data_type() {
73 DataType::Utf8 => soundex_array::<i32>(array),
74 DataType::LargeUtf8 => soundex_array::<i64>(array),
75 DataType::Utf8View => soundex_view(array),
76 other => {
77 exec_err!("unsupported data type {other:?} for function `soundex`")
78 }
79 }
80}
81
82fn soundex_array<T: OffsetSizeTrait>(array: &ArrayRef) -> Result<ArrayRef> {
83 let str_array = as_generic_string_array::<T>(array)?;
84 let result = str_array
85 .iter()
86 .map(|s| s.map(compute_soundex))
87 .collect::<StringArray>();
88 Ok(Arc::new(result))
89}
90
91fn soundex_view(str_view: &ArrayRef) -> Result<ArrayRef> {
92 let str_array = as_string_view_array(str_view)?;
93 let result = str_array
94 .iter()
95 .map(|opt_str| opt_str.map(compute_soundex))
96 .collect::<StringArray>();
97 Ok(Arc::new(result) as ArrayRef)
98}
99
/// Maps a letter (ASCII case-insensitive) to its Soundex digit.
///
/// Returns `None` for every character that has no digit: vowels, `H`, `W`,
/// `Y`, and anything that is not an ASCII letter.
fn classify_char(c: char) -> Option<char> {
    let digit = match c.to_ascii_uppercase() {
        'R' => '6',
        'M' | 'N' => '5',
        'L' => '4',
        'D' | 'T' => '3',
        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
        'B' | 'F' | 'P' | 'V' => '1',
        _ => return None,
    };
    Some(digit)
}
111
/// Returns `true` for `H` and `W` in either ASCII case — the letters
/// `compute_soundex` skips without resetting the previously seen code.
fn is_ignored(c: char) -> bool {
    c.eq_ignore_ascii_case(&'h') || c.eq_ignore_ascii_case(&'w')
}
115
116fn compute_soundex(s: &str) -> String {
117 let mut chars = s.chars();
118
119 let first_char = match chars.next() {
120 Some(c) if c.is_ascii_alphabetic() => c.to_ascii_uppercase(),
121 _ => return s.to_string(),
122 };
123
124 let mut soundex_code = String::with_capacity(4);
125 soundex_code.push(first_char);
126 let mut last_code = classify_char(first_char);
127
128 for c in chars {
129 if soundex_code.len() >= 4 {
130 break;
131 }
132
133 if is_ignored(c) {
134 continue;
135 }
136
137 match classify_char(c) {
138 Some(code) => {
139 if last_code != Some(code) {
140 soundex_code.push(code);
141 }
142 last_code = Some(code);
143 }
144 None => {
145 last_code = None;
146 }
147 }
148 }
149 format!("{soundex_code:0<4}")
150}