use std::any::Any;
use std::sync::Arc;
use crate::core::column::{Column, ColumnTrait, ColumnType};
use crate::core::error::{Error, Result};
use crate::storage::simple_unified_string_pool::{
SimpleStringPoolStats, SimpleStringView, SimpleUnifiedStringPool,
};
/// String column whose rows are stored as ids into a [`SimpleUnifiedStringPool`].
///
/// Cloning the column clones the `Arc` handles, not the underlying pool or id buffer.
#[derive(Debug, Clone)]
pub struct SimpleZeroCopyStringColumn {
/// Pool the row ids are resolved against; held behind an `Arc` so it can be
/// shared with other columns (see `with_shared_pool`).
pool: Arc<SimpleUnifiedStringPool>,
/// One pool id per row, in row order. The column length is the length of this slice.
string_ids: Arc<[u32]>,
/// Optional null bitmask. Row `i` is treated as null when bit `i % 8` of byte
/// `i / 8` is set; `None` means no row is null.
null_mask: Option<Arc<[u8]>>,
/// Optional column name.
name: Option<String>,
}
impl SimpleZeroCopyStringColumn {
pub fn new(data: Vec<String>) -> Result<Self> {
let pool = Arc::new(SimpleUnifiedStringPool::new());
let string_ids = pool.add_strings(&data)?;
Ok(Self {
pool,
string_ids: string_ids.into(),
null_mask: None,
name: None,
})
}
/// Builds a column from `data`, storing the strings in the caller-supplied `pool`.
///
/// The strings are added to `pool` and the returned ids become the column's rows.
/// The resulting column has no null mask and no name.
///
/// # Errors
/// Propagates any error the pool reports while adding the strings.
pub fn with_shared_pool(data: Vec<String>, pool: Arc<SimpleUnifiedStringPool>) -> Result<Self> {
    let ids: Arc<[u32]> = pool.add_strings(&data)?.into();
    Ok(Self {
        string_ids: ids,
        pool,
        name: None,
        null_mask: None,
    })
}
/// Builds a column from `data` (with its own new pool) and assigns it `name`.
///
/// # Errors
/// Propagates any error from [`Self::new`].
pub fn with_name(data: Vec<String>, name: impl Into<String>) -> Result<Self> {
    Self::new(data).map(|mut named| {
        named.name = Some(name.into());
        named
    })
}
pub fn with_nulls(data: Vec<String>, nulls: Vec<bool>) -> Result<Self> {
let null_mask = if nulls.iter().any(|&is_null| is_null) {
Some(crate::column::common::utils::create_bitmask(&nulls))
} else {
None
};
let mut column = Self::new(data)?;
column.null_mask = null_mask;
Ok(column)
}
/// Replaces the column's name with `name`.
pub fn set_name(&mut self, name: impl Into<String>) {
    let new_name: String = name.into();
    self.name = Some(new_name);
}
/// Returns the column's name, or `None` when no name has been set.
pub fn get_name(&self) -> Option<&str> {
    self.name.as_ref().map(|stored| stored.as_str())
}
/// Returns a pool view of the string at row `index`, or `None` when the row is null.
///
/// A row is null when a null mask exists and bit `index % 8` of byte `index / 8`
/// is set. A mask that is too short to cover `index` counts as "not null".
///
/// # Errors
/// * `Error::IndexOutOfBounds` when `index` is not a valid row.
/// * Any error the pool reports while resolving the row's string id.
pub fn get_view(&self, index: usize) -> Result<Option<SimpleStringView>> {
    let string_id = match self.string_ids.get(index) {
        Some(&id) => id,
        None => {
            return Err(Error::IndexOutOfBounds {
                index,
                size: self.string_ids.len(),
            })
        }
    };

    // Missing mask, or a mask byte that does not exist, both mean "not null".
    let is_null = self.null_mask.as_ref().map_or(false, |mask| {
        mask.get(index / 8)
            .map_or(false, |&byte| (byte & (1u8 << (index % 8))) != 0)
    });
    if is_null {
        return Ok(None);
    }

    Ok(Some(self.pool.get_string(string_id)?))
}
/// Returns an owned copy of the string at row `index`, or `None` when the row is null.
///
/// # Errors
/// Propagates errors from [`Self::get_view`] and from materialising the view.
pub fn get(&self, index: usize) -> Result<Option<String>> {
    let view = match self.get_view(index)? {
        None => return Ok(None),
        Some(view) => view,
    };
    let owned = view.as_str()?;
    Ok(Some(owned))
}
/// Returns the views for the rows listed in `indices`, in the same order.
///
/// # Errors
/// Stops at, and returns, the first error produced by [`Self::get_view`]
/// (for example an out-of-bounds index).
pub fn get_views(&self, indices: &[usize]) -> Result<Vec<Option<SimpleStringView>>> {
    indices.iter().map(|&row| self.get_view(row)).collect()
}
/// Materialises every row as an owned `String`; null rows become `None`.
///
/// # Errors
/// Stops at, and returns, the first error produced by [`Self::get`].
pub fn to_strings(&self) -> Result<Vec<Option<String>>> {
    (0..self.string_ids.len()).map(|row| self.get(row)).collect()
}
/// Calls `f` once per row, in row order, passing the row's view (`None` for a
/// null row), and collects the results.
///
/// # Errors
/// Stops at, and returns, the first error produced by [`Self::get_view`];
/// `f` is not called for that row or any later one.
pub fn map_views<F, R>(&self, mut f: F) -> Result<Vec<R>>
where
    F: FnMut(Option<SimpleStringView>) -> R,
{
    (0..self.string_ids.len())
        .map(|row| self.get_view(row).map(&mut f))
        .collect()
}
/// Returns the indices of all non-null rows whose string satisfies `predicate`.
///
/// Null rows are skipped without calling `predicate`. Indices are returned in
/// ascending row order.
///
/// # Errors
/// Propagates errors from [`Self::get_view`] and from reading a view's contents.
pub fn filter_views<F>(&self, mut predicate: F) -> Result<Vec<usize>>
where
    F: FnMut(&str) -> bool,
{
    let mut matching_rows = Vec::new();
    for row in 0..self.string_ids.len() {
        let view = match self.get_view(row)? {
            Some(view) => view,
            None => continue,
        };
        if view.with_str_ref(&mut predicate)? {
            matching_rows.push(row);
        }
    }
    Ok(matching_rows)
}
/// Reports whether any non-null row is exactly equal to `target`.
///
/// Scanning stops at the first match.
///
/// # Errors
/// Propagates errors from [`Self::get_view`] and from reading a view's contents.
pub fn contains(&self, target: &str) -> Result<bool> {
    for row in 0..self.string_ids.len() {
        let view = match self.get_view(row)? {
            Some(view) => view,
            None => continue,
        };
        if view.with_str_ref(|text| text == target)? {
            return Ok(true);
        }
    }
    Ok(false)
}
/// Counts the non-null rows that are exactly equal to `target`.
///
/// # Errors
/// Propagates errors from [`Self::get_view`] and from reading a view's contents.
pub fn count_occurrences(&self, target: &str) -> Result<usize> {
    let mut hits: usize = 0;
    for row in 0..self.string_ids.len() {
        let view = match self.get_view(row)? {
            Some(view) => view,
            None => continue,
        };
        // `true` counts as 1, `false` as 0.
        hits += usize::from(view.with_str_ref(|text| text == target)?);
    }
    Ok(hits)
}
/// Returns one view per distinct non-null string, in order of first appearance.
///
/// Distinctness is decided by the string contents. The previous implementation
/// compared only `metadata().hash`, so two different strings whose hashes
/// collided were reported as one value.
///
/// # Errors
/// Propagates errors from [`Self::get_view`] and from materialising a view's
/// contents for comparison.
pub fn unique_views(&self) -> Result<Vec<SimpleStringView>> {
    let mut unique_views = Vec::new();
    let mut seen_contents: std::collections::HashSet<String> = std::collections::HashSet::new();
    for i in 0..self.string_ids.len() {
        if let Some(view) = self.get_view(i)? {
            // `insert` returns true only the first time a given string is seen.
            if seen_contents.insert(view.as_str()?) {
                unique_views.push(view);
            }
        }
    }
    Ok(unique_views)
}
/// Returns, for every row, the length reported by its view (`len()`), or `None`
/// for a null row.
///
/// # Errors
/// Propagates errors from [`Self::get_view`].
pub fn string_lengths(&self) -> Result<Vec<Option<usize>>> {
    self.map_views(|maybe_view| maybe_view.map(|view| view.len()))
}
/// Returns the `Arc` handle to the string pool backing this column.
///
/// Clone the handle to build further columns on the same pool (see
/// `with_shared_pool`).
pub fn pool(&self) -> &Arc<SimpleUnifiedStringPool> {
&self.pool
}
/// Returns the statistics reported by the backing pool's `stats()`.
///
/// The numbers describe the pool as a whole; when the pool is shared they are
/// not limited to this column's rows.
///
/// # Errors
/// Propagates any error returned by the pool.
pub fn pool_stats(&self) -> Result<SimpleStringPoolStats> {
self.pool.stats()
}
/// Creates a column over the same pool whose rows are the given `string_ids`.
///
/// The name and the null mask are copied from `self` unchanged.
///
/// NOTE(review): the ids are not checked against the pool, and the null mask is
/// carried over positionally — if `string_ids` is a reordering or subset of the
/// original rows, the mask bits no longer line up with them. Confirm callers
/// only use this where that is acceptable.
pub fn with_string_ids(&self, string_ids: Vec<u32>) -> Self {
    let ids: Arc<[u32]> = Arc::from(string_ids);
    Self {
        name: self.name.clone(),
        null_mask: self.null_mask.clone(),
        string_ids: ids,
        pool: Arc::clone(&self.pool),
    }
}
/// Returns a new column, on the same pool, with every non-null string lowercased.
///
/// Null rows stay null: the null mask is copied to the result (a placeholder
/// empty string is stored for those rows). The column name is not carried over.
///
/// # Errors
/// Propagates errors from [`Self::get_view`], from reading a view's contents
/// (previously such a failure was silently replaced by an empty string), and
/// from adding the new strings to the pool.
pub fn to_lowercase_optimized(&self) -> Result<SimpleZeroCopyStringColumn> {
    let mut lowered = Vec::with_capacity(self.string_ids.len());
    for row in 0..self.string_ids.len() {
        lowered.push(match self.get_view(row)? {
            Some(view) => view.as_str()?.to_lowercase(),
            // Placeholder only; the copied null mask keeps this row null.
            None => String::new(),
        });
    }
    let mut column =
        SimpleZeroCopyStringColumn::with_shared_pool(lowered, Arc::clone(&self.pool))?;
    // Row count and order are unchanged, so the existing mask still lines up.
    column.null_mask = self.null_mask.clone();
    Ok(column)
}
/// Returns a new column, on the same pool, with every non-null string uppercased.
///
/// Null rows stay null: the null mask is copied to the result (a placeholder
/// empty string is stored for those rows). The column name is not carried over.
///
/// # Errors
/// Propagates errors from [`Self::get_view`], from reading a view's contents
/// (previously such a failure was silently replaced by an empty string), and
/// from adding the new strings to the pool.
pub fn to_uppercase_optimized(&self) -> Result<SimpleZeroCopyStringColumn> {
    let mut raised = Vec::with_capacity(self.string_ids.len());
    for row in 0..self.string_ids.len() {
        raised.push(match self.get_view(row)? {
            Some(view) => view.as_str()?.to_uppercase(),
            // Placeholder only; the copied null mask keeps this row null.
            None => String::new(),
        });
    }
    let mut column =
        SimpleZeroCopyStringColumn::with_shared_pool(raised, Arc::clone(&self.pool))?;
    // Row count and order are unchanged, so the existing mask still lines up.
    column.null_mask = self.null_mask.clone();
    Ok(column)
}
/// Concatenates this column with `other` row by row, producing a new column on
/// this column's pool.
///
/// Per row:
/// * both non-null → `left + separator + right`
/// * exactly one non-null → that value alone, without the separator
/// * both null → an empty string
///
/// The result has no null mask and no name.
///
/// # Errors
/// * `Error::InconsistentRowCount` when the two columns differ in length.
/// * Errors from [`Self::get_view`], from reading a view's contents, or from
///   adding the new strings to the pool.
pub fn concat_with(
    &self,
    other: &SimpleZeroCopyStringColumn,
    separator: &str,
) -> Result<SimpleZeroCopyStringColumn> {
    let row_count = self.string_ids.len();
    let other_rows = other.string_ids.len();
    if row_count != other_rows {
        return Err(Error::InconsistentRowCount {
            expected: row_count,
            found: other_rows,
        });
    }

    let joined = (0..row_count)
        .map(|row| -> Result<String> {
            let lhs = self.get_view(row)?;
            let rhs = other.get_view(row)?;
            Ok(match (lhs, rhs) {
                (Some(l), Some(r)) => format!("{}{}{}", l.as_str()?, separator, r.as_str()?),
                // Only one side present: pass it through unchanged.
                (Some(only), None) | (None, Some(only)) => only.as_str()?,
                (None, None) => String::new(),
            })
        })
        .collect::<Result<Vec<String>>>()?;

    SimpleZeroCopyStringColumn::with_shared_pool(joined, Arc::clone(&self.pool))
}
}
/// Generic column interface for the pool-backed string column.
impl ColumnTrait for SimpleZeroCopyStringColumn {
    /// Number of rows (one string id per row).
    fn len(&self) -> usize {
        let ids: &[u32] = &self.string_ids;
        ids.len()
    }

    /// `true` when the column has no rows.
    fn is_empty(&self) -> bool {
        self.string_ids.len() == 0
    }

    /// Always [`ColumnType::String`].
    fn column_type(&self) -> ColumnType {
        ColumnType::String
    }

    /// The column's name, if one has been set.
    fn name(&self) -> Option<&str> {
        self.get_name()
    }

    /// Materialises the column into a plain `StringColumn` wrapped in [`Column::String`].
    ///
    /// Null rows become empty strings, and if reading the rows fails the result
    /// is built from an empty list. Neither the null mask nor the name is passed
    /// on to `StringColumn::new`.
    fn clone_column(&self) -> Column {
        let values: Vec<String> = self
            .to_strings()
            .unwrap_or_default()
            .into_iter()
            .map(Option::unwrap_or_default)
            .collect();
        Column::String(crate::column::StringColumn::new(values))
    }

    /// Exposes the column as `&dyn Any` for downcasting.
    fn as_any(&self) -> &dyn Any {
        self
    }
}
/// Additional view-based operations for pool-backed string columns.
pub trait SimpleZeroCopyStringOps {
/// Applies `f` to each row's view and collects the results.
///
/// `f` receives `Option<SimpleStringView>`; the implementation for
/// `SimpleZeroCopyStringColumn` in this file forwards to `map_views`, which
/// passes `None` for null rows.
fn transform_zero_copy<F, R>(&self, f: F) -> Result<Vec<R>>
where
F: FnMut(Option<SimpleStringView>) -> R;
/// Builds a new column holding the `start..end` portion of each row.
///
/// NOTE(review): the unit of `start`/`end` (bytes vs. characters) is decided by
/// the view's `len()`/`substring()`, which are not visible here — confirm.
fn substring_views(&self, start: usize, end: usize) -> Result<SimpleZeroCopyStringColumn>;
}
impl SimpleZeroCopyStringOps for SimpleZeroCopyStringColumn {
    /// Forwards to [`SimpleZeroCopyStringColumn::map_views`].
    fn transform_zero_copy<F, R>(&self, f: F) -> Result<Vec<R>>
    where
        F: FnMut(Option<SimpleStringView>) -> R,
    {
        SimpleZeroCopyStringColumn::map_views(self, f)
    }

    /// Builds a new column, on the same pool, holding the `start..end` slice of
    /// each row.
    ///
    /// Both bounds are clamped to the row's `len()`. A null row, or a clamped
    /// range that is empty, yields an empty string. The result has no null mask
    /// and no name.
    ///
    /// # Errors
    /// Propagates errors from `get_view`, from the view's `substring`/`as_str`,
    /// and from adding the new strings to the pool.
    fn substring_views(&self, start: usize, end: usize) -> Result<SimpleZeroCopyStringColumn> {
        let pieces = (0..self.string_ids.len())
            .map(|row| -> Result<String> {
                let view = match self.get_view(row)? {
                    Some(view) => view,
                    None => return Ok(String::new()),
                };
                let len = view.len();
                let (from, to) = (start.min(len), end.min(len));
                if from >= to {
                    return Ok(String::new());
                }
                Ok(view.substring(from, to)?.as_str()?)
            })
            .collect::<Result<Vec<String>>>()?;

        SimpleZeroCopyStringColumn::with_shared_pool(pieces, Arc::clone(&self.pool))
    }
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_simple_zero_copy_string_column_creation() {
    let data = vec!["hello".to_string(), "world".to_string(), "test".to_string()];
    let column =
        SimpleZeroCopyStringColumn::new(data.clone()).expect("operation should succeed");

    assert_eq!(column.len(), 3);
    assert!(!column.is_empty());
    assert_eq!(column.column_type(), ColumnType::String);

    // Every row must read back exactly what was put in.
    for (row, expected) in data.iter().enumerate() {
        let actual = column
            .get(row)
            .expect("operation should succeed")
            .expect("operation should succeed");
        assert_eq!(&actual, expected);
    }
}
#[test]
fn test_zero_copy_views() {
    let column =
        SimpleZeroCopyStringColumn::new(vec!["hello".to_string(), "world".to_string()])
            .expect("operation should succeed");

    // Both rows are five characters long and must resolve to their source text.
    for (row, expected) in ["hello", "world"].iter().enumerate() {
        let view = column
            .get_view(row)
            .expect("operation should succeed")
            .expect("operation should succeed");
        assert_eq!(view.as_str().expect("operation should succeed"), *expected);
        assert_eq!(view.len(), 5);
    }
}
#[test]
fn test_string_deduplication() {
    // Five rows, three distinct values.
    let words = ["hello", "world", "hello", "test", "world"];
    let data: Vec<String> = words.iter().map(|word| String::from(*word)).collect();
    let column = SimpleZeroCopyStringColumn::new(data).expect("operation should succeed");

    let stats = column.pool_stats().expect("operation should succeed");
    assert_eq!(stats.total_strings, 5);
    assert_eq!(stats.unique_strings, 3);
    assert!(
        stats.deduplication_ratio > 0.0,
        "Deduplication ratio: {}",
        stats.deduplication_ratio
    );
    assert!(
        (stats.deduplication_ratio - 0.4).abs() < 0.001,
        "Expected ~0.4, got {}",
        stats.deduplication_ratio
    );

    // Deduplication must not change what each row reads back as.
    for (row, expected) in words.iter().enumerate() {
        let actual = column
            .get(row)
            .expect("operation should succeed")
            .expect("operation should succeed");
        assert_eq!(actual, *expected);
    }
}
#[test]
fn test_zero_copy_operations() {
    let column = SimpleZeroCopyStringColumn::new(vec![
        "hello".to_string(),
        "world".to_string(),
        "test".to_string(),
    ])
    .expect("operation should succeed");

    let has = |needle: &str| column.contains(needle).expect("operation should succeed");
    assert!(has("hello"));
    assert!(has("world"));
    assert!(!has("missing"));

    let count = |needle: &str| {
        column
            .count_occurrences(needle)
            .expect("operation should succeed")
    };
    assert_eq!(count("hello"), 1);
    assert_eq!(count("missing"), 0);

    let lengths = column.string_lengths().expect("operation should succeed");
    assert_eq!(lengths, vec![Some(5), Some(5), Some(4)]);
}
#[test]
fn test_zero_copy_filtering() {
    let fruits = ["apple", "banana", "cherry", "apricot"];
    let column = SimpleZeroCopyStringColumn::new(
        fruits.iter().map(|fruit| String::from(*fruit)).collect(),
    )
    .expect("operation should succeed");

    // Rows starting with 'a'.
    let starting_with_a = column
        .filter_views(|s| s.starts_with('a'))
        .expect("operation should succeed");
    assert_eq!(starting_with_a, vec![0, 3]);

    // Rows longer than five bytes.
    let longer_than_five = column
        .filter_views(|s| s.len() > 5)
        .expect("operation should succeed");
    assert_eq!(longer_than_five, vec![1, 2, 3]);
}
#[test]
fn test_zero_copy_transformations() {
    let column = SimpleZeroCopyStringColumn::new(vec![
        "Hello".to_string(),
        "WORLD".to_string(),
        "Test".to_string(),
    ])
    .expect("operation should succeed");

    // Reads one non-null cell out of a column.
    let cell = |col: &SimpleZeroCopyStringColumn, row: usize| {
        col.get(row)
            .expect("operation should succeed")
            .expect("operation should succeed")
    };

    let lowercase = column
        .to_lowercase_optimized()
        .expect("operation should succeed");
    for (row, expected) in ["hello", "world", "test"].iter().enumerate() {
        assert_eq!(cell(&lowercase, row), *expected);
    }

    let uppercase = column
        .to_uppercase_optimized()
        .expect("operation should succeed");
    for (row, expected) in ["HELLO", "WORLD", "TEST"].iter().enumerate() {
        assert_eq!(cell(&uppercase, row), *expected);
    }
}
#[test]
fn test_shared_pool() {
    let pool = Arc::new(SimpleUnifiedStringPool::new());
    let column1 = SimpleZeroCopyStringColumn::with_shared_pool(
        vec!["shared".to_string(), "pool".to_string()],
        Arc::clone(&pool),
    )
    .expect("operation should succeed");
    let column2 = SimpleZeroCopyStringColumn::with_shared_pool(
        vec!["test".to_string(), "shared".to_string()],
        Arc::clone(&pool),
    )
    .expect("operation should succeed");

    // "shared" appears in both columns but is stored once in the common pool.
    let stats = pool.stats().expect("operation should succeed");
    assert_eq!(stats.unique_strings, 3);

    let expectations = [
        (&column1, 0usize, "shared"),
        (&column1, 1, "pool"),
        (&column2, 0, "test"),
        (&column2, 1, "shared"),
    ];
    for (column, row, expected) in expectations.iter() {
        let actual = column
            .get(*row)
            .expect("operation should succeed")
            .expect("operation should succeed");
        assert_eq!(actual, *expected);
    }
}
#[test]
fn test_concatenation() {
    let column1 =
        SimpleZeroCopyStringColumn::new(vec!["hello".to_string(), "world".to_string()])
            .expect("operation should succeed");
    let column2 =
        SimpleZeroCopyStringColumn::new(vec!["there".to_string(), "test".to_string()])
            .expect("operation should succeed");

    let concatenated = column1
        .concat_with(&column2, " ")
        .expect("operation should succeed");

    // Rows are joined pairwise with the separator in between.
    for (row, expected) in ["hello there", "world test"].iter().enumerate() {
        let actual = concatenated
            .get(row)
            .expect("operation should succeed")
            .expect("operation should succeed");
        assert_eq!(actual, *expected);
    }
}
#[test]
fn test_with_nulls() {
    let column = SimpleZeroCopyStringColumn::with_nulls(
        vec!["hello".to_string(), "world".to_string(), "test".to_string()],
        vec![false, true, false],
    )
    .expect("operation should succeed");

    let first = column.get(0).expect("operation should succeed");
    assert_eq!(first.expect("operation should succeed"), "hello");

    // The middle row is flagged null, so no value is returned for it.
    let second = column.get(1).expect("operation should succeed");
    assert!(second.is_none());

    let third = column.get(2).expect("operation should succeed");
    assert_eq!(third.expect("operation should succeed"), "test");
}
}