#include <iostream>
#include <memory>
#include <vector>
#include <random>
#include <algorithm>
#include <stdexcept>
using namespace std;
#include "bm.h"
#include "bmtimer.h"
#include "bmsparsevec.h"
#include "bmsparsevec_compr.h"
#include "bmdbg.h"
#include "bmundef.h"
const unsigned test_size = 250000000; const unsigned sampling_interval = 2500;
typedef bm::bvector<> bvector_type;
typedef bvector_type::size_type bv_size_type;
typedef bm::sparse_vector<unsigned, bvector_type> sparse_vector_u32;
typedef bm::rsc_sparse_vector<unsigned, sparse_vector_u32> rsc_sparse_vector_u32;
typedef std::vector<std::pair<bv_size_type, bv_size_type> > bv_ranges_vector;
bm::chrono_taker::duration_map_type timing_map;
static
void generate_test_data(rsc_sparse_vector_u32& csv, unsigned size)
{
BM_DECLARE_TEMP_BLOCK(tb)
unsigned cnt = 0;
{
auto bit = csv.get_back_inserter(); for (unsigned i = 0; i < size/3; )
{
bit = cnt++;
unsigned nc = (unsigned)(rand() % 16);
bit.add_null(nc);
i+=nc+1;
} bit.flush();
}
{
auto bit = csv.get_back_inserter(); bit.add_null(size/3);
for (unsigned i = 0; i < size; )
{
bit = cnt++;
unsigned nc = (unsigned)(rand() % 16);
bit.add_null(nc);
i+=nc+1;
} bit.flush();
}
csv.optimize(tb); csv.sync(); }
static
void generate_access_samples(std::vector<bvector_type::size_type> &sample_vec,
unsigned size)
{
for (unsigned i = 0; i < size; )
{
unsigned sample = (unsigned)(rand() % 256); sample_vec.push_back(sample);
i += sample;
}
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(sample_vec.begin(), sample_vec.end(), g);
}
static
void compute_historgam(sparse_vector_u32& hist_sv,
const rsc_sparse_vector_u32& csv,
sparse_vector_u32::size_type sampling_size)
{
assert(sampling_size);
assert(sampling_size < csv.size());
BM_DECLARE_TEMP_BLOCK(tb)
sparse_vector_u32::size_type from = 0;
sparse_vector_u32::size_type to = sampling_size-1;
const rsc_sparse_vector_u32::bvector_type* bv_null = csv.get_null_bvector();
assert(bv_null);
{
auto bit = hist_sv.get_back_inserter();
auto sz = csv.size();
do
{
auto cnt = bv_null->count_range(from, to); bit = cnt;
from += sampling_size;
to = from + sampling_size - 1;
} while (from < sz);
bit.flush();
}
hist_sv.optimize(tb); }
static
void compute_rsc_historgam(rsc_sparse_vector_u32& hist_rsc,
const rsc_sparse_vector_u32& csv,
sparse_vector_u32::size_type sampling_size)
{
assert(sampling_size);
assert(sampling_size < csv.size());
{
sparse_vector_u32 hist_sv(bm::use_null);
rsc_sparse_vector_u32::size_type from = 0;
rsc_sparse_vector_u32::size_type to = sampling_size-1;
const rsc_sparse_vector_u32::bvector_type* bv_null = csv.get_null_bvector();
assert(bv_null);
auto sz = csv.size();
do
{
auto cnt = bv_null->count_range(from, to); hist_sv.set(from, cnt);
from += sampling_size;
to = from + sampling_size - 1;
} while (from < sz);
hist_rsc.load_from(hist_sv);
}
BM_DECLARE_TEMP_BLOCK(tb)
hist_rsc.optimize(tb);
hist_rsc.sync();
}
static
void compute_adaptive_rsc_histogram(rsc_sparse_vector_u32& hist_rsc,
const rsc_sparse_vector_u32& csv,
sparse_vector_u32::size_type sampling_size)
{
assert(sampling_size);
BM_DECLARE_TEMP_BLOCK(tb)
const bvector_type* bv_null = csv.get_null_bvector();
bv_size_type split_rank = sampling_size;
bv_ranges_vector pair_vect;
bm::rank_range_split(*bv_null, split_rank, pair_vect);
size_t sz = pair_vect.size();
for (size_t k = 0; k < sz; ++k)
{
const auto& p = pair_vect[k]; hist_rsc.push_back(p.first, 0); } hist_rsc.optimize(tb);
hist_rsc.sync();
}
static
void verify_histograms(const rsc_sparse_vector_u32& hist_rsc,
const sparse_vector_u32& hist_sv,
sparse_vector_u32::size_type sampling_size)
{
const rsc_sparse_vector_u32::bvector_type* bv_null = hist_rsc.get_null_bvector();
auto en = bv_null->get_enumerator(0);
for (;en.valid(); ++en)
{
auto idx = *en;
rsc_sparse_vector_u32::size_type sv_idx = (idx / sampling_size);
auto v1 = hist_rsc[idx];
auto v2 = hist_sv[sv_idx];
if (v1 != v2)
{
cerr << "Discrepancy at:" << idx << endl;
exit(1);
}
} }
static
unsigned long long access_bench1(const sparse_vector_u32& hist_sv,
const std::vector<bvector_type::size_type>& sample_vec,
unsigned sampling_size)
{
unsigned long long sum = 0;
for (size_t i = 0; i < sample_vec.size(); ++i)
{
auto idx = sample_vec[i];
idx = idx / sampling_size; auto v = hist_sv[idx];
sum += v;
}
return sum;
}
static
unsigned long long access_bench2(const rsc_sparse_vector_u32& hist_rsc,
const std::vector<bvector_type::size_type>& sample_vec)
{
const bvector_type* bv_null = hist_rsc.get_null_bvector();
assert(bv_null);
unsigned long long sum = 0;
for (size_t i = 0; i < sample_vec.size(); ++i)
{
auto idx = sample_vec[i];
bvector_type::size_type pos;
bool found = bv_null->find_reverse(idx, pos);
assert(found); (void)found;
auto v = hist_rsc[pos];
sum += v;
}
return sum;
}
static
void access_bench3(const rsc_sparse_vector_u32& hist_rsc,
const std::vector<bvector_type::size_type>& sample_vec,
const rsc_sparse_vector_u32& rsc_data)
{
bvector_type::size_type last;
{
const bvector_type* bv_null = rsc_data.get_null_bvector();
bool found = bv_null->find_reverse(last);
assert(found); (void)found;
}
const bvector_type* bv_null = hist_rsc.get_null_bvector();
assert(bv_null);
for (size_t i = 0; i < sample_vec.size(); ++i)
{
auto idx = sample_vec[i];
bvector_type::size_type pos_start, pos_end;
bool found = bv_null->find_reverse(idx, pos_start);
assert(found);
found = bv_null->find(idx+1, pos_end);
if (!found)
pos_end = last;
}
}
int main(void)
{
try
{
rsc_sparse_vector_u32 rsc_test;
bv_size_type data_set_size = 0;
sparse_vector_u32 hist1;
unsigned hist1_avg = 0;
rsc_sparse_vector_u32 hist2;
rsc_sparse_vector_u32 hist3;
std::vector<bvector_type::size_type> svec;
generate_test_data(rsc_test, test_size);
{
const bvector_type* bv_null = rsc_test.get_null_bvector();
assert(bv_null);
data_set_size = bv_null->count(); cout << "Data set size: " << rsc_test.size() << endl;
cout << "Number of elements/events in the data set: " << data_set_size << endl;
}
{
bm::chrono_taker tt1("01. Histogram 1 construction (SV)) ", 1, &timing_map);
compute_historgam(hist1, rsc_test, sampling_interval);
}
{
sparse_vector_u32::statistics st;
hist1.calc_stat(&st);
cout << "Histogram 1 (SV)" << endl;
cout << " size: " << hist1.size() << endl;
cout << " RAM size: " << st.memory_used << endl;
{
bm::chrono_taker tt1("05. Serialize and save the histogram 1 (SV)", 1, &timing_map);
size_t serialized_size = 0;
int res = bm::file_save_svector(hist1, "hist1.sv", &serialized_size, true);
if (res!=0)
cerr << "Failed to save!" << endl;
else
cout << " file size: " << serialized_size << endl;
}
unsigned h1_sum = 0;
auto it = hist1.begin();
auto it_end = hist1.end();
for (; it != it_end; ++it)
h1_sum += *it;
assert (h1_sum == data_set_size);
hist1_avg = h1_sum / hist1.size();
}
{
bm::chrono_taker tt1("02. Histogram 2 construction (RSC) ", 1, &timing_map);
compute_rsc_historgam(hist2, rsc_test, sampling_interval);
}
{
rsc_sparse_vector_u32::statistics st;
hist2.calc_stat(&st);
cout << "Histogram 2 (RSC)" << endl;
cout << " size: " << hist2.size() << endl;
cout << " NOT NULL count: " << hist2.get_null_bvector()->count() << endl;
cout << " RAM size: " << st.memory_used << endl;
{
bm::chrono_taker tt1("06. Serialize and save histogram 2(RSC)", 1, &timing_map);
size_t serialized_size = 0;
int res = bm::file_save_svector(hist2, "hist2.sv", &serialized_size, true);
if (res!=0)
cerr << "Failed to save!" << endl;
else
cout << " file size: " << serialized_size << endl;
}
}
{
bm::chrono_taker tt1("03. Histogram 3 (adaptive) construction (RSC) ", 1, &timing_map);
compute_adaptive_rsc_histogram(hist3, rsc_test, hist1_avg);
}
{
rsc_sparse_vector_u32::statistics st;
hist3.calc_stat(&st);
cout << "Histogram 3 adaptive (RSC)" << endl;
cout << " size: " << hist3.size() << endl;
cout << " NOT NULL count: " << hist3.get_null_bvector()->count() << endl;
cout << " RAM size: " << st.memory_used << endl;
{
bm::chrono_taker tt1("07. Serialize and save the adaptive histogram 3 (RSC)", 1, &timing_map);
size_t serialized_size = 0;
int res = bm::file_save_svector(hist3, "hist3.sv", &serialized_size, true);
if (res!=0)
cerr << "Failed to save!" << endl;
else
cout << " file size: " << serialized_size << endl;
}
}
generate_access_samples(svec, test_size);
cout << endl;
cout << "Access sample size = " << svec.size() << endl;
{
bm::chrono_taker tt1("04. Verification", 1, &timing_map);
verify_histograms(hist2, hist1, sampling_interval);
}
unsigned long long sum1(0), sum2(0);
{
bm::chrono_taker tt1("08. Random access test H1 (SV)", 1, &timing_map);
sum1 = access_bench1(hist1, svec, sampling_interval);
}
{
bm::chrono_taker tt1("09. Random access test H2 (RSC)", 1, &timing_map);
sum2 = access_bench2(hist2, svec);
}
if (sum1 != sum2) {
cerr << "Control sum discrepancy!" << endl;
return 1;
}
{
bm::chrono_taker tt1("10. Random access test H3 adaptive (RSC)", 1, &timing_map);
access_bench3(hist3, svec, rsc_test);
}
cout << endl;
bm::chrono_taker::print_duration_map(timing_map, bm::chrono_taker::ct_ops_per_sec);
}
catch(std::exception& ex)
{
std::cerr << ex.what() << std::endl;
return 1;
}
return 0;
}