datafusion_functions/crypto/
md5.rs1use arrow::{array::StringViewArray, datatypes::DataType};
19use datafusion_common::{
20 Result, ScalarValue,
21 cast::as_binary_array,
22 internal_err,
23 types::{logical_binary, logical_string},
24 utils::take_function_args,
25};
26use datafusion_expr::{
27 ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature,
28 TypeSignature, Volatility,
29};
30use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
31use datafusion_macros::user_doc;
32use std::{any::Any, sync::Arc};
33
34use crate::crypto::basic::{DigestAlgorithm, digest_process};
35
36#[user_doc(
37 doc_section(label = "Hashing Functions"),
38 description = "Computes an MD5 128-bit checksum for a string expression.",
39 syntax_example = "md5(expression)",
40 sql_example = r#"```sql
41> select md5('foo');
42+----------------------------------+
43| md5(Utf8("foo")) |
44+----------------------------------+
45| acbd18db4cc2f85cedef654fccc4a4d8 |
46+----------------------------------+
47```"#,
48 standard_argument(name = "expression", prefix = "String")
49)]
/// Scalar UDF implementation whose `name()` is `md5`.
///
/// The only state it carries is the [`Signature`] describing which argument
/// types the function accepts.
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct Md5Func {
    /// Accepted argument signatures; populated by [`Md5Func::new`].
    signature: Signature,
}
54
55impl Default for Md5Func {
56 fn default() -> Self {
57 Self::new()
58 }
59}
60
61impl Md5Func {
62 pub fn new() -> Self {
63 Self {
64 signature: Signature::one_of(
65 vec![
66 TypeSignature::Coercible(vec![Coercion::new_exact(
67 TypeSignatureClass::Native(logical_string()),
68 )]),
69 TypeSignature::Coercible(vec![Coercion::new_exact(
70 TypeSignatureClass::Native(logical_binary()),
71 )]),
72 ],
73 Volatility::Immutable,
74 ),
75 }
76 }
77}
78
79impl ScalarUDFImpl for Md5Func {
80 fn as_any(&self) -> &dyn Any {
81 self
82 }
83
84 fn name(&self) -> &str {
85 "md5"
86 }
87
88 fn signature(&self) -> &Signature {
89 &self.signature
90 }
91
92 fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
93 Ok(DataType::Utf8View)
94 }
95
96 fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
97 md5(&args.args)
98 }
99
100 fn documentation(&self) -> Option<&Documentation> {
101 self.doc()
102 }
103}
104
/// Lookup table mapping a 4-bit value (0..=15) to its lowercase ASCII hex digit.
const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";

/// Encodes `data` as a lowercase hexadecimal string.
///
/// Every input byte produces exactly two output characters (high nibble
/// first), so the result has length `2 * data.len()`. Empty input yields an
/// empty string.
#[inline]
fn hex_encode(data: impl AsRef<[u8]>) -> String {
    let input = data.as_ref();
    // Output size is known up front: two hex digits per byte.
    let mut encoded = String::with_capacity(input.len() * 2);
    encoded.extend(input.iter().flat_map(|&byte| {
        let high = HEX_CHARS_LOWER[usize::from(byte >> 4)];
        let low = HEX_CHARS_LOWER[usize::from(byte & 0x0f)];
        [char::from(high), char::from(low)]
    }));
    encoded
}
120
121fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
122 let [data] = take_function_args("md5", args)?;
123 let value = digest_process(data, DigestAlgorithm::Md5)?;
124
125 Ok(match value {
127 ColumnarValue::Array(array) => {
128 let binary_array = as_binary_array(&array)?;
129 let string_array: StringViewArray =
130 binary_array.iter().map(|opt| opt.map(hex_encode)).collect();
131 ColumnarValue::Array(Arc::new(string_array))
132 }
133 ColumnarValue::Scalar(ScalarValue::Binary(opt)) => {
134 ColumnarValue::Scalar(ScalarValue::Utf8View(opt.map(hex_encode)))
135 }
136 _ => return internal_err!("Impossibly got invalid results from digest"),
137 })
138}