#include <iostream>
#include <sstream>
#include <chrono>
#include <regex>
#include <time.h>
#include <stdio.h>
#include <vector>
#include <map>
#include <chrono>
#include <map>
#include <utility>
#include <algorithm>
#include <random>
using namespace std;
#include "bm.h"
#include "bmalgo.h"
#include "bmserial.h"
#include "bmrandom.h"
#include "bmstrsparsevec.h"
#include "bmsparsevec_algo.h"
#include "bmsparsevec_serial.h"
#include "bmalgo_similarity.h"
#include "bmsparsevec_util.h"
#include "bmdbg.h"
#include "bmtimer.h"
static
void show_help()
{
std::cerr
<< "BitMagic Range Index Search Sample (c) 2019" << std::endl
<< "-i file-name -- input set file to parse" << std::endl
<< "-svout spase vector output -- sparse vector name to save" << std::endl
<< "-svin sparse vector input -- sparse vector file name to load " << std::endl
<< "-diag -- run diagnostics" << std::endl
<< "-bench -- run benchmarks" << std::endl
<< "-timing -- collect timings" << std::endl
;
}
std::string sv_out_name;
std::string sv_in_name;
std::string i_name;
bool is_diag = false;
bool is_timing = false;
bool is_bench = false;
static
int parse_args(int argc, char *argv[])
{
for (int i = 1; i < argc; ++i)
{
std::string arg = argv[i];
if ((arg == "-h") || (arg == "--help"))
{
show_help();
return 0;
}
if (arg == "-svout" || arg == "--svout")
{
if (i + 1 < argc)
{
sv_out_name = argv[++i];
}
else
{
std::cerr << "Error: -svout requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-svin" || arg == "--svin")
{
if (i + 1 < argc)
{
sv_in_name = argv[++i];
}
else
{
std::cerr << "Error: -svin requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-i" || arg == "--i" )
{
if (i + 1 < argc)
{
i_name = argv[++i];
}
else
{
std::cerr << "Error: -idict requires file name" << std::endl;
return 1;
}
continue;
}
if (arg == "-diag" || arg == "--diag" || arg == "-d" || arg == "--d")
{
is_diag = true;
continue;
}
if (arg == "-timing" || arg == "--timing" || arg == "-t" || arg == "--t")
{
is_timing = true;
continue;
}
if (arg == "-bench" || arg == "--bench" || arg == "-b" || arg == "--b")
{
is_bench = true;
continue;
}
std::cerr << "Error: unknown argument: " << arg << std::endl;
return 1;
} return 0;
}
typedef bm::sparse_vector<unsigned, bm::bvector<> > sparse_vector_u32;
typedef bm::str_sparse_vector<char, bm::bvector<>, 64> str_sparse_vect;
struct RangeIndex
{
str_sparse_vect id_from_v;
sparse_vector_u32 range_from_v;
sparse_vector_u32 len_v;
str_sparse_vect id_to_v;
};
bm::chrono_taker::duration_map_type timing_map;
static
int load_range_idx(const std::string& fname, RangeIndex& range_idx)
{
bm::chrono_taker tt1("1. parse input data ", 1, &timing_map);
std::ifstream fin(fname.c_str(), std::ios::in);
if (!fin.good())
return -1;
std::string line;
std::regex reg("\t|, |\\s+"); std::sregex_token_iterator it_end;
string trim_chars(", ");
str_sparse_vect::back_insert_iterator id_from_bi = range_idx.id_from_v.get_back_inserter();
sparse_vector_u32::back_insert_iterator range_from_bi = range_idx.range_from_v.get_back_inserter();
sparse_vector_u32::back_insert_iterator len_bi = range_idx.len_v.get_back_inserter();
str_sparse_vect::back_insert_iterator id_to_bi = range_idx.id_to_v.get_back_inserter();
for (unsigned i = 0; std::getline(fin, line); ++i)
{
if (line.empty())
continue;
std::sregex_token_iterator it(line.begin(), line.end(), reg, -1);
std::vector<std::string> line_vec(it, it_end);
if (line_vec.empty() || line_vec.size() != 4)
continue;
try
{
const string& id_from = line_vec.at(0);
const string& r_from = line_vec.at(1);
const string& r_to = line_vec.at(2);
const string& id_to = line_vec.at(3);
size_t p;
unsigned from = unsigned(std::stoul(r_from, &p));
unsigned to = unsigned(std::stoul(r_to, &p));
if (to < from)
{
std::cerr << "Incorrect index for:" << std::endl;
std::cerr << id_from << " " << r_from << " " << r_to << " " << id_to << std::endl;
continue; }
id_from_bi.add(id_from.c_str());
range_from_bi.add(from);
unsigned len = to - from; len_bi.add(len);
id_to_bi.add(id_to.c_str());
}
catch (std::exception&)
{
}
if ((i & 0xFFFF) == 0)
{
cout << "\rReading input file: " << i << flush;
}
}
id_from_bi.flush();
range_from_bi.flush();
len_bi.flush();
id_to_bi.flush();
cout << endl;
return 0;
}
static
void optimize_range_idx(RangeIndex& range_idx)
{
bm::chrono_taker tt1("2. optimize index ", 1, &timing_map);
{
str_sparse_vect v_tmp;
v_tmp.remap_from(range_idx.id_from_v);
range_idx.id_from_v.swap(v_tmp);
}
{
str_sparse_vect v_tmp;
v_tmp.remap_from(range_idx.id_to_v);
range_idx.id_to_v.swap(v_tmp);
}
BM_DECLARE_TEMP_BLOCK(tb)
range_idx.id_from_v.optimize(tb);
range_idx.range_from_v.optimize(tb);
range_idx.len_v.optimize(tb);
range_idx.id_to_v.optimize(tb);
}
template<typename SV>
int save_index_vector(const string& fn, const SV& sv)
{
bm::sparse_vector_serial_layout<SV> sv_lay;
BM_DECLARE_TEMP_BLOCK(tb)
bm::sparse_vector_serialize(sv, sv_lay, tb);
{
std::ofstream fout(fn.c_str(), std::ios::binary);
if (!fout.good())
return -1;
const char* buf = (char*)sv_lay.buf();
fout.write(buf, (unsigned)sv_lay.size());
if (!fout.good())
return -1;
fout.close();
}
return 0;
}
static
int save_range_idx(const string& fname, const RangeIndex& range_idx)
{
bm::chrono_taker tt1("3. save sparse vectors", 1, &timing_map);
int res;
{
std::string fn = fname;
fn.append(".id_from.sv");
res = save_index_vector(fn, range_idx.id_from_v);
if (res)
return res;
}
{
std::string fn = fname;
fn.append(".range_from.sv");
res = save_index_vector(fn, range_idx.range_from_v);
if (res)
return res;
}
{
std::string fn = fname;
fn.append(".len.sv");
res = save_index_vector(fn, range_idx.len_v);
if (res)
return res;
}
{
std::string fn = fname;
fn.append(".id_to.sv");
res = save_index_vector(fn, range_idx.id_to_v);
if (res)
return res;
}
return res;
}
static
int file_load_range_idx(const string& fname, RangeIndex& range_idx)
{
bm::chrono_taker tt1("4. load sparse vector", 1, &timing_map);
int res;
{
std::string fn = fname;
fn.append(".id_from.sv");
res = bm::file_load_svector(range_idx.id_from_v, fn);
if (res)
return res;
}
{
std::string fn = fname;
fn.append(".range_from.sv");
res = bm::file_load_svector(range_idx.range_from_v, fn);
if (res)
return res;
}
{
std::string fn = fname;
fn.append(".len.sv");
res = bm::file_load_svector(range_idx.len_v, fn);
if (res)
return res;
}
{
std::string fn = fname;
fn.append(".id_to.sv");
res = bm::file_load_svector(range_idx.id_to_v, fn);
if (res)
return res;
}
return res;
}
const unsigned benchmark_max = 15000;
static
void run_search(const RangeIndex& range_idx, const string& search_key)
{
bm::sparse_vector_scanner<str_sparse_vect> scanner;
scanner.bind(range_idx.id_from_v, true);
{
bm::chrono_taker tt1("5. bm::sparse_vector_scanner<> binary search", benchmark_max, &timing_map);
str_sparse_vect::size_type pos;
bool found = scanner.bfind_eq_str(search_key.c_str(), pos);
string id_from, id_to;
unsigned from, to;
for (unsigned i = 0; i < benchmark_max; ++i)
{
while (found)
{
range_idx.id_from_v.get(pos, id_from);
if (id_from != search_key)
break;
from = range_idx.range_from_v[pos];
to = range_idx.len_v[pos];
to += from;
range_idx.id_to_v.get(pos, id_to);
++pos;
found = pos < range_idx.id_from_v.size();
} }
}
}
int main(int argc, char *argv[])
{
if (argc < 3)
{
show_help();
return 1;
}
RangeIndex r_idx;
try
{
auto ret = parse_args(argc, argv);
if (ret != 0)
{
return ret;
}
if (!i_name.empty())
{
auto res = load_range_idx(i_name, r_idx);
if (res != 0)
{
return res;
}
optimize_range_idx(r_idx);
}
if (!sv_out_name.empty())
{
save_range_idx(sv_out_name, r_idx);
}
if (!sv_in_name.empty())
{
file_load_range_idx(sv_in_name, r_idx);
}
if (is_diag)
{
std::cout << "ID_FROM:" << std::endl;
print_svector_stat(r_idx.id_from_v, false);
std::cout << "RANGE_FROM:" << std::endl;
print_svector_stat(r_idx.range_from_v, false);
std::cout << "LEN:" << std::endl;
print_svector_stat(r_idx.len_v, false);
std::cout << "ID_TO:" << std::endl;
print_svector_stat(r_idx.id_to_v, false);
}
run_search(r_idx, "CUU0");
if (is_timing) {
std::cout << std::endl << "Performance:" << std::endl;
bm::chrono_taker::print_duration_map(timing_map, bm::chrono_taker::ct_time);
}
}
catch (std::exception& ex)
{
std::cerr << "Error:" << ex.what() << std::endl;
return 1;
}
return 0;
}