#include "table/block_based_table_reader.h"
#include <string>
#include <utility>
#include "db/dbformat.h"
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/env.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "rocksdb/table_properties.h"
#include "table/block.h"
#include "table/filter_block.h"
#include "table/block_based_filter_block.h"
#include "table/block_based_table_factory.h"
#include "table/full_filter_block.h"
#include "table/block_hash_index.h"
#include "table/block_prefix_index.h"
#include "table/format.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/two_level_iterator.h"
#include "table/get_context.h"
#include "util/coding.h"
#include "util/file_reader_writer.h"
#include "util/perf_context_imp.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
namespace rocksdb {
extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock;
using std::unique_ptr;
typedef BlockBasedTable::IndexReader IndexReader;
namespace {
const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) =
kMaxVarint64Length * 3 + 1;
Status ReadBlockFromFile(RandomAccessFileReader* file, const Footer& footer,
const ReadOptions& options, const BlockHandle& handle,
std::unique_ptr<Block>* result, Env* env,
bool do_uncompress = true) {
BlockContents contents;
Status s = ReadBlockContents(file, footer, options, handle, &contents, env,
do_uncompress);
if (s.ok()) {
result->reset(new Block(std::move(contents)));
}
return s;
}
template <class ResourceType>
void DeleteHeldResource(void* arg, void* ignored) {
delete reinterpret_cast<ResourceType*>(arg);
}
template <class Entry>
void DeleteCachedEntry(const Slice& key, void* value) {
auto entry = reinterpret_cast<Entry*>(value);
delete entry;
}
void ReleaseCachedEntry(void* arg, void* h) {
Cache* cache = reinterpret_cast<Cache*>(arg);
Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
cache->Release(handle);
}
Slice GetCacheKey(const char* cache_key_prefix, size_t cache_key_prefix_size,
const BlockHandle& handle, char* cache_key) {
assert(cache_key != nullptr);
assert(cache_key_prefix_size != 0);
assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize);
memcpy(cache_key, cache_key_prefix, cache_key_prefix_size);
char* end =
EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset());
return Slice(cache_key, static_cast<size_t>(end - cache_key));
}
Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
Tickers block_cache_miss_ticker,
Tickers block_cache_hit_ticker,
Statistics* statistics) {
auto cache_handle = block_cache->Lookup(key);
if (cache_handle != nullptr) {
PERF_COUNTER_ADD(block_cache_hit_count, 1);
RecordTick(statistics, BLOCK_CACHE_HIT);
RecordTick(statistics, BLOCK_CACHE_BYTES_READ,
block_cache->GetUsage(cache_handle));
RecordTick(statistics, block_cache_hit_ticker);
} else {
RecordTick(statistics, BLOCK_CACHE_MISS);
RecordTick(statistics, block_cache_miss_ticker);
}
return cache_handle;
}
}
class BlockBasedTable::IndexReader {
public:
explicit IndexReader(const Comparator* comparator)
: comparator_(comparator) {}
virtual ~IndexReader() {}
virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
bool total_order_seek = true) = 0;
virtual size_t size() const = 0;
virtual size_t usable_size() const = 0;
virtual size_t ApproximateMemoryUsage() const = 0;
protected:
const Comparator* comparator_;
};
class BinarySearchIndexReader : public IndexReader {
public:
static Status Create(RandomAccessFileReader* file, const Footer& footer,
const BlockHandle& index_handle, Env* env,
const Comparator* comparator,
IndexReader** index_reader) {
std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle,
&index_block, env);
if (s.ok()) {
*index_reader =
new BinarySearchIndexReader(comparator, std::move(index_block));
}
return s;
}
virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
bool dont_care = true) override {
return index_block_->NewIterator(comparator_, iter, true);
}
virtual size_t size() const override { return index_block_->size(); }
virtual size_t usable_size() const override {
return index_block_->usable_size();
}
virtual size_t ApproximateMemoryUsage() const override {
assert(index_block_);
return index_block_->ApproximateMemoryUsage();
}
private:
BinarySearchIndexReader(const Comparator* comparator,
std::unique_ptr<Block>&& index_block)
: IndexReader(comparator), index_block_(std::move(index_block)) {
assert(index_block_ != nullptr);
}
std::unique_ptr<Block> index_block_;
};
class HashIndexReader : public IndexReader {
public:
static Status Create(const SliceTransform* hash_key_extractor,
const Footer& footer, RandomAccessFileReader* file,
Env* env, const Comparator* comparator,
const BlockHandle& index_handle,
InternalIterator* meta_index_iter,
IndexReader** index_reader,
bool hash_index_allow_collision) {
std::unique_ptr<Block> index_block;
auto s = ReadBlockFromFile(file, footer, ReadOptions(), index_handle,
&index_block, env);
if (!s.ok()) {
return s;
}
auto new_index_reader =
new HashIndexReader(comparator, std::move(index_block));
*index_reader = new_index_reader;
BlockHandle prefixes_handle;
s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesBlock,
&prefixes_handle);
if (!s.ok()) {
return Status::OK();
}
BlockHandle prefixes_meta_handle;
s = FindMetaBlock(meta_index_iter, kHashIndexPrefixesMetadataBlock,
&prefixes_meta_handle);
if (!s.ok()) {
return Status::OK();
}
BlockContents prefixes_contents;
s = ReadBlockContents(file, footer, ReadOptions(), prefixes_handle,
&prefixes_contents, env, true );
if (!s.ok()) {
return s;
}
BlockContents prefixes_meta_contents;
s = ReadBlockContents(file, footer, ReadOptions(), prefixes_meta_handle,
&prefixes_meta_contents, env,
true );
if (!s.ok()) {
return Status::OK();
}
if (!hash_index_allow_collision) {
BlockHashIndex* hash_index = nullptr;
s = CreateBlockHashIndex(hash_key_extractor,
prefixes_contents.data,
prefixes_meta_contents.data,
&hash_index);
if (s.ok()) {
new_index_reader->index_block_->SetBlockHashIndex(hash_index);
new_index_reader->OwnPrefixesContents(std::move(prefixes_contents));
}
} else {
BlockPrefixIndex* prefix_index = nullptr;
s = BlockPrefixIndex::Create(hash_key_extractor,
prefixes_contents.data,
prefixes_meta_contents.data,
&prefix_index);
if (s.ok()) {
new_index_reader->index_block_->SetBlockPrefixIndex(prefix_index);
}
}
return Status::OK();
}
virtual InternalIterator* NewIterator(BlockIter* iter = nullptr,
bool total_order_seek = true) override {
return index_block_->NewIterator(comparator_, iter, total_order_seek);
}
virtual size_t size() const override { return index_block_->size(); }
virtual size_t usable_size() const override {
return index_block_->usable_size();
}
virtual size_t ApproximateMemoryUsage() const override {
assert(index_block_);
return index_block_->ApproximateMemoryUsage() +
prefixes_contents_.data.size();
}
private:
HashIndexReader(const Comparator* comparator,
std::unique_ptr<Block>&& index_block)
: IndexReader(comparator), index_block_(std::move(index_block)) {
assert(index_block_ != nullptr);
}
~HashIndexReader() {
}
void OwnPrefixesContents(BlockContents&& prefixes_contents) {
prefixes_contents_ = std::move(prefixes_contents);
}
std::unique_ptr<Block> index_block_;
BlockContents prefixes_contents_;
};
struct BlockBasedTable::Rep {
Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options,
const BlockBasedTableOptions& _table_opt,
const InternalKeyComparator& _internal_comparator, bool skip_filters)
: ioptions(_ioptions),
env_options(_env_options),
table_options(_table_opt),
filter_policy(skip_filters ? nullptr : _table_opt.filter_policy.get()),
internal_comparator(_internal_comparator),
filter_type(FilterType::kNoFilter),
whole_key_filtering(_table_opt.whole_key_filtering),
prefix_filtering(true) {}
const ImmutableCFOptions& ioptions;
const EnvOptions& env_options;
const BlockBasedTableOptions& table_options;
const FilterPolicy* const filter_policy;
const InternalKeyComparator& internal_comparator;
Status status;
unique_ptr<RandomAccessFileReader> file;
char cache_key_prefix[kMaxCacheKeyPrefixSize];
size_t cache_key_prefix_size = 0;
char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
size_t compressed_cache_key_prefix_size = 0;
Footer footer;
unique_ptr<IndexReader> index_reader;
unique_ptr<FilterBlockReader> filter;
enum class FilterType {
kNoFilter,
kFullFilter,
kBlockFilter,
};
FilterType filter_type;
BlockHandle filter_handle;
std::shared_ptr<const TableProperties> table_properties;
BlockBasedTableOptions::IndexType index_type;
bool hash_index_allow_collision;
bool whole_key_filtering;
bool prefix_filtering;
unique_ptr<SliceTransform> internal_prefix_transform;
};
BlockBasedTable::~BlockBasedTable() {
delete rep_;
}
template <class TValue>
struct BlockBasedTable::CachableEntry {
CachableEntry(TValue* _value, Cache::Handle* _cache_handle)
: value(_value), cache_handle(_cache_handle) {}
CachableEntry() : CachableEntry(nullptr, nullptr) {}
void Release(Cache* cache) {
if (cache_handle) {
cache->Release(cache_handle);
value = nullptr;
cache_handle = nullptr;
}
}
TValue* value = nullptr;
Cache::Handle* cache_handle = nullptr;
};
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) {
assert(kMaxCacheKeyPrefixSize >= 10);
rep->cache_key_prefix_size = 0;
rep->compressed_cache_key_prefix_size = 0;
if (rep->table_options.block_cache != nullptr) {
GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(),
&rep->cache_key_prefix[0], &rep->cache_key_prefix_size);
}
if (rep->table_options.block_cache_compressed != nullptr) {
GenerateCachePrefix(rep->table_options.block_cache_compressed.get(),
rep->file->file(), &rep->compressed_cache_key_prefix[0],
&rep->compressed_cache_key_prefix_size);
}
}
void BlockBasedTable::GenerateCachePrefix(Cache* cc,
RandomAccessFile* file, char* buffer, size_t* size) {
*size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
if (*size == 0) {
char* end = EncodeVarint64(buffer, cc->NewId());
*size = static_cast<size_t>(end - buffer);
}
}
void BlockBasedTable::GenerateCachePrefix(Cache* cc,
WritableFile* file, char* buffer, size_t* size) {
*size = file->GetUniqueId(buffer, kMaxCacheKeyPrefixSize);
if (*size == 0) {
char* end = EncodeVarint64(buffer, cc->NewId());
*size = static_cast<size_t>(end - buffer);
}
}
namespace {
bool IsFeatureSupported(const TableProperties& table_properties,
const std::string& user_prop_name, Logger* info_log) {
auto& props = table_properties.user_collected_properties;
auto pos = props.find(user_prop_name);
if (pos != props.end()) {
if (pos->second == kPropFalse) {
return false;
} else if (pos->second != kPropTrue) {
Log(InfoLogLevel::WARN_LEVEL, info_log,
"Property %s has invalidate value %s", user_prop_name.c_str(),
pos->second.c_str());
}
}
return true;
}
}
Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions,
const EnvOptions& env_options,
const BlockBasedTableOptions& table_options,
const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFileReader>&& file,
uint64_t file_size,
unique_ptr<TableReader>* table_reader,
const bool prefetch_index_and_filter,
const bool skip_filters) {
table_reader->reset();
Footer footer;
auto s = ReadFooterFromFile(file.get(), file_size, &footer,
kBlockBasedTableMagicNumber);
if (!s.ok()) {
return s;
}
if (!BlockBasedTableSupportedVersion(footer.version())) {
return Status::Corruption(
"Unknown Footer version. Maybe this file was created with newer "
"version of RocksDB?");
}
Rep* rep = new BlockBasedTable::Rep(ioptions, env_options, table_options,
internal_comparator, skip_filters);
rep->file = std::move(file);
rep->footer = footer;
rep->index_type = table_options.index_type;
rep->hash_index_allow_collision = table_options.hash_index_allow_collision;
SetupCacheKeyPrefix(rep);
unique_ptr<BlockBasedTable> new_table(new BlockBasedTable(rep));
std::unique_ptr<Block> meta;
std::unique_ptr<InternalIterator> meta_iter;
s = ReadMetaBlock(rep, &meta, &meta_iter);
if (!s.ok()) {
return s;
}
if (rep->filter_policy) {
for (auto prefix : {kFullFilterBlockPrefix, kFilterBlockPrefix}) {
std::string filter_block_key = prefix;
filter_block_key.append(rep->filter_policy->Name());
if (FindMetaBlock(meta_iter.get(), filter_block_key, &rep->filter_handle)
.ok()) {
rep->filter_type = (prefix == kFullFilterBlockPrefix)
? Rep::FilterType::kFullFilter
: Rep::FilterType::kBlockFilter;
break;
}
}
}
bool found_properties_block = true;
s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block);
if (!s.ok()) {
Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log,
"Cannot seek to properties block from file: %s",
s.ToString().c_str());
} else if (found_properties_block) {
s = meta_iter->status();
TableProperties* table_properties = nullptr;
if (s.ok()) {
s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer,
rep->ioptions.env, rep->ioptions.info_log,
&table_properties);
}
if (!s.ok()) {
Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log,
"Encountered error while reading data from properties "
"block %s", s.ToString().c_str());
} else {
rep->table_properties.reset(table_properties);
}
} else {
Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log,
"Cannot find Properties block from file.");
}
if (rep->table_properties) {
rep->whole_key_filtering &=
IsFeatureSupported(*(rep->table_properties),
BlockBasedTablePropertyNames::kWholeKeyFiltering,
rep->ioptions.info_log);
rep->prefix_filtering &= IsFeatureSupported(
*(rep->table_properties),
BlockBasedTablePropertyNames::kPrefixFiltering, rep->ioptions.info_log);
}
if (prefetch_index_and_filter) {
if (table_options.cache_index_and_filter_blocks) {
assert(table_options.block_cache != nullptr);
unique_ptr<InternalIterator> iter(
new_table->NewIndexIterator(ReadOptions()));
s = iter->status();
if (s.ok()) {
auto filter_entry = new_table->GetFilter();
filter_entry.Release(table_options.block_cache.get());
}
} else {
IndexReader* index_reader = nullptr;
s = new_table->CreateIndexReader(&index_reader, meta_iter.get());
if (s.ok()) {
rep->index_reader.reset(index_reader);
if (rep->filter_policy) {
rep->filter.reset(ReadFilter(rep, nullptr));
}
} else {
delete index_reader;
}
}
}
if (s.ok()) {
*table_reader = std::move(new_table);
}
return s;
}
void BlockBasedTable::SetupForCompaction() {
switch (rep_->ioptions.access_hint_on_compaction_start) {
case Options::NONE:
break;
case Options::NORMAL:
rep_->file->file()->Hint(RandomAccessFile::NORMAL);
break;
case Options::SEQUENTIAL:
rep_->file->file()->Hint(RandomAccessFile::SEQUENTIAL);
break;
case Options::WILLNEED:
rep_->file->file()->Hint(RandomAccessFile::WILLNEED);
break;
default:
assert(false);
}
compaction_optimized_ = true;
}
std::shared_ptr<const TableProperties> BlockBasedTable::GetTableProperties()
const {
return rep_->table_properties;
}
size_t BlockBasedTable::ApproximateMemoryUsage() const {
size_t usage = 0;
if (rep_->filter) {
usage += rep_->filter->ApproximateMemoryUsage();
}
if (rep_->index_reader) {
usage += rep_->index_reader->ApproximateMemoryUsage();
}
return usage;
}
Status BlockBasedTable::ReadMetaBlock(Rep* rep,
std::unique_ptr<Block>* meta_block,
std::unique_ptr<InternalIterator>* iter) {
std::unique_ptr<Block> meta;
Status s = ReadBlockFromFile(
rep->file.get(),
rep->footer,
ReadOptions(),
rep->footer.metaindex_handle(),
&meta,
rep->ioptions.env);
if (!s.ok()) {
Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log,
"Encountered error while reading data from properties"
" block %s", s.ToString().c_str());
return s;
}
*meta_block = std::move(meta);
iter->reset(meta_block->get()->NewIterator(BytewiseComparator()));
return Status::OK();
}
Status BlockBasedTable::GetDataBlockFromCache(
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics,
const ReadOptions& read_options,
BlockBasedTable::CachableEntry<Block>* block, uint32_t format_version) {
Status s;
Block* compressed_block = nullptr;
Cache::Handle* block_cache_compressed_handle = nullptr;
if (block_cache != nullptr) {
block->cache_handle =
GetEntryFromCache(block_cache, block_cache_key, BLOCK_CACHE_DATA_MISS,
BLOCK_CACHE_DATA_HIT, statistics);
if (block->cache_handle != nullptr) {
block->value =
reinterpret_cast<Block*>(block_cache->Value(block->cache_handle));
return s;
}
}
assert(block->cache_handle == nullptr && block->value == nullptr);
if (block_cache_compressed == nullptr) {
return s;
}
assert(!compressed_block_cache_key.empty());
block_cache_compressed_handle =
block_cache_compressed->Lookup(compressed_block_cache_key);
if (block_cache_compressed_handle == nullptr) {
RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
return s;
}
RecordTick(statistics, BLOCK_CACHE_COMPRESSED_HIT);
compressed_block = reinterpret_cast<Block*>(
block_cache_compressed->Value(block_cache_compressed_handle));
assert(compressed_block->compression_type() != kNoCompression);
BlockContents contents;
s = UncompressBlockContents(compressed_block->data(),
compressed_block->size(), &contents,
format_version);
if (s.ok()) {
block->value = new Block(std::move(contents)); assert(block->value->compression_type() == kNoCompression);
if (block_cache != nullptr && block->value->cachable() &&
read_options.fill_cache) {
block->cache_handle = block_cache->Insert(block_cache_key, block->value,
block->value->usable_size(),
&DeleteCachedEntry<Block>);
assert(reinterpret_cast<Block*>(
block_cache->Value(block->cache_handle)) == block->value);
}
}
block_cache_compressed->Release(block_cache_compressed_handle);
return s;
}
Status BlockBasedTable::PutDataBlockToCache(
const Slice& block_cache_key, const Slice& compressed_block_cache_key,
Cache* block_cache, Cache* block_cache_compressed,
const ReadOptions& read_options, Statistics* statistics,
CachableEntry<Block>* block, Block* raw_block, uint32_t format_version) {
assert(raw_block->compression_type() == kNoCompression ||
block_cache_compressed != nullptr);
Status s;
BlockContents contents;
if (raw_block->compression_type() != kNoCompression) {
s = UncompressBlockContents(raw_block->data(), raw_block->size(), &contents,
format_version);
}
if (!s.ok()) {
delete raw_block;
return s;
}
if (raw_block->compression_type() != kNoCompression) {
block->value = new Block(std::move(contents)); } else {
block->value = raw_block;
raw_block = nullptr;
}
if (block_cache_compressed != nullptr && raw_block != nullptr &&
raw_block->cachable()) {
auto cache_handle = block_cache_compressed->Insert(
compressed_block_cache_key, raw_block, raw_block->usable_size(),
&DeleteCachedEntry<Block>);
block_cache_compressed->Release(cache_handle);
RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS);
raw_block = nullptr;
}
delete raw_block;
assert((block->value->compression_type() == kNoCompression));
if (block_cache != nullptr && block->value->cachable()) {
block->cache_handle = block_cache->Insert(block_cache_key, block->value,
block->value->usable_size(),
&DeleteCachedEntry<Block>);
RecordTick(statistics, BLOCK_CACHE_ADD);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
block->value->usable_size());
assert(reinterpret_cast<Block*>(block_cache->Value(block->cache_handle)) ==
block->value);
}
return s;
}
FilterBlockReader* BlockBasedTable::ReadFilter(Rep* rep, size_t* filter_size) {
if (rep->filter_type == Rep::FilterType::kNoFilter) {
return nullptr;
}
BlockContents block;
if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(),
rep->filter_handle, &block, rep->ioptions.env,
false).ok()) {
return nullptr;
}
if (filter_size) {
*filter_size = block.data.size();
}
assert(rep->filter_policy);
if (rep->filter_type == Rep::FilterType::kBlockFilter) {
return new BlockBasedFilterBlockReader(
rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
rep->table_options, rep->whole_key_filtering, std::move(block));
} else if (rep->filter_type == Rep::FilterType::kFullFilter) {
auto filter_bits_reader =
rep->filter_policy->GetFilterBitsReader(block.data);
if (filter_bits_reader != nullptr) {
return new FullFilterBlockReader(
rep->prefix_filtering ? rep->ioptions.prefix_extractor : nullptr,
rep->whole_key_filtering, std::move(block), filter_bits_reader);
}
}
assert(false);
return nullptr;
}
BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
bool no_io) const {
if (!rep_->table_options.cache_index_and_filter_blocks) {
return {rep_->filter.get(), nullptr };
}
PERF_TIMER_GUARD(read_filter_block_nanos);
Cache* block_cache = rep_->table_options.block_cache.get();
if (rep_->filter_policy == nullptr ||
block_cache == nullptr ) {
return {nullptr , nullptr };
}
char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
rep_->footer.metaindex_handle(),
cache_key);
Statistics* statistics = rep_->ioptions.statistics;
auto cache_handle =
GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS,
BLOCK_CACHE_FILTER_HIT, statistics);
FilterBlockReader* filter = nullptr;
if (cache_handle != nullptr) {
filter = reinterpret_cast<FilterBlockReader*>(
block_cache->Value(cache_handle));
} else if (no_io) {
return CachableEntry<FilterBlockReader>();
} else {
size_t filter_size = 0;
filter = ReadFilter(rep_, &filter_size);
if (filter != nullptr) {
assert(filter_size > 0);
cache_handle = block_cache->Insert(key, filter, filter_size,
&DeleteCachedEntry<FilterBlockReader>);
RecordTick(statistics, BLOCK_CACHE_ADD);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter_size);
}
}
return { filter, cache_handle };
}
InternalIterator* BlockBasedTable::NewIndexIterator(
const ReadOptions& read_options, BlockIter* input_iter) {
if (rep_->index_reader) {
return rep_->index_reader->NewIterator(
input_iter, read_options.total_order_seek);
}
PERF_TIMER_GUARD(read_index_block_nanos);
bool no_io = read_options.read_tier == kBlockCacheTier;
Cache* block_cache = rep_->table_options.block_cache.get();
char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
rep_->footer.index_handle(), cache_key);
Statistics* statistics = rep_->ioptions.statistics;
auto cache_handle =
GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS,
BLOCK_CACHE_INDEX_HIT, statistics);
if (cache_handle == nullptr && no_io) {
if (input_iter != nullptr) {
input_iter->SetStatus(Status::Incomplete("no blocking io"));
return input_iter;
} else {
return NewErrorInternalIterator(Status::Incomplete("no blocking io"));
}
}
IndexReader* index_reader = nullptr;
if (cache_handle != nullptr) {
index_reader =
reinterpret_cast<IndexReader*>(block_cache->Value(cache_handle));
} else {
Status s;
s = CreateIndexReader(&index_reader);
if (!s.ok()) {
assert(index_reader == nullptr);
if (input_iter != nullptr) {
input_iter->SetStatus(s);
return input_iter;
} else {
return NewErrorInternalIterator(s);
}
}
cache_handle =
block_cache->Insert(key, index_reader, index_reader->usable_size(),
&DeleteCachedEntry<IndexReader>);
RecordTick(statistics, BLOCK_CACHE_ADD);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
index_reader->usable_size());
}
assert(cache_handle);
auto* iter = index_reader->NewIterator(
input_iter, read_options.total_order_seek);
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle);
return iter;
}
InternalIterator* BlockBasedTable::NewDataBlockIterator(
Rep* rep, const ReadOptions& ro, const Slice& index_value,
BlockIter* input_iter) {
PERF_TIMER_GUARD(new_table_block_iter_nanos);
const bool no_io = (ro.read_tier == kBlockCacheTier);
Cache* block_cache = rep->table_options.block_cache.get();
Cache* block_cache_compressed =
rep->table_options.block_cache_compressed.get();
CachableEntry<Block> block;
BlockHandle handle;
Slice input = index_value;
Status s = handle.DecodeFrom(&input);
if (!s.ok()) {
if (input_iter != nullptr) {
input_iter->SetStatus(s);
return input_iter;
} else {
return NewErrorInternalIterator(s);
}
}
if (block_cache != nullptr || block_cache_compressed != nullptr) {
Statistics* statistics = rep->ioptions.statistics;
char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
Slice key,
ckey ;
if (block_cache != nullptr) {
key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size,
handle, cache_key);
}
if (block_cache_compressed != nullptr) {
ckey = GetCacheKey(rep->compressed_cache_key_prefix,
rep->compressed_cache_key_prefix_size, handle,
compressed_cache_key);
}
s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed,
statistics, ro, &block,
rep->table_options.format_version);
if (block.value == nullptr && !no_io && ro.fill_cache) {
std::unique_ptr<Block> raw_block;
{
StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS);
s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
&raw_block, rep->ioptions.env,
block_cache_compressed == nullptr);
}
if (s.ok()) {
s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed,
ro, statistics, &block, raw_block.release(),
rep->table_options.format_version);
}
}
}
if (block.value == nullptr) {
if (no_io) {
if (input_iter != nullptr) {
input_iter->SetStatus(Status::Incomplete("no blocking io"));
return input_iter;
} else {
return NewErrorInternalIterator(Status::Incomplete("no blocking io"));
}
}
std::unique_ptr<Block> block_value;
s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle,
&block_value, rep->ioptions.env);
if (s.ok()) {
block.value = block_value.release();
}
}
InternalIterator* iter;
if (block.value != nullptr) {
iter = block.value->NewIterator(&rep->internal_comparator, input_iter);
if (block.cache_handle != nullptr) {
iter->RegisterCleanup(&ReleaseCachedEntry, block_cache,
block.cache_handle);
} else {
iter->RegisterCleanup(&DeleteHeldResource<Block>, block.value, nullptr);
}
} else {
if (input_iter != nullptr) {
input_iter->SetStatus(s);
iter = input_iter;
} else {
iter = NewErrorInternalIterator(s);
}
}
return iter;
}
class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
public:
BlockEntryIteratorState(BlockBasedTable* table,
const ReadOptions& read_options, bool skip_filters)
: TwoLevelIteratorState(table->rep_->ioptions.prefix_extractor !=
nullptr),
table_(table),
read_options_(read_options),
skip_filters_(skip_filters) {}
InternalIterator* NewSecondaryIterator(const Slice& index_value) override {
return NewDataBlockIterator(table_->rep_, read_options_, index_value);
}
bool PrefixMayMatch(const Slice& internal_key) override {
if (read_options_.total_order_seek || skip_filters_) {
return true;
}
return table_->PrefixMayMatch(internal_key);
}
private:
BlockBasedTable* table_;
const ReadOptions read_options_;
bool skip_filters_;
};
bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) {
if (!rep_->filter_policy) {
return true;
}
assert(rep_->ioptions.prefix_extractor != nullptr);
auto user_key = ExtractUserKey(internal_key);
if (!rep_->ioptions.prefix_extractor->InDomain(user_key)) {
return true;
}
auto prefix = rep_->ioptions.prefix_extractor->Transform(user_key);
InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue);
auto internal_prefix = internal_key_prefix.Encode();
bool may_match = true;
Status s;
ReadOptions no_io_read_options;
no_io_read_options.read_tier = kBlockCacheTier;
auto filter_entry = GetFilter(true );
FilterBlockReader* filter = filter_entry.value;
if (filter != nullptr && !filter->IsBlockBased()) {
may_match = filter->PrefixMayMatch(prefix);
}
if (may_match) {
unique_ptr<InternalIterator> iiter(NewIndexIterator(no_io_read_options));
iiter->Seek(internal_prefix);
if (!iiter->Valid()) {
may_match = iiter->status().IsIncomplete();
} else if (ExtractUserKey(iiter->key()).starts_with(
ExtractUserKey(internal_prefix))) {
may_match = true;
} else if (filter != nullptr && filter->IsBlockBased()) {
Slice handle_value = iiter->value();
BlockHandle handle;
s = handle.DecodeFrom(&handle_value);
assert(s.ok());
may_match = filter->PrefixMayMatch(prefix, handle.offset());
}
}
Statistics* statistics = rep_->ioptions.statistics;
RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED);
if (!may_match) {
RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL);
}
filter_entry.Release(rep_->table_options.block_cache.get());
return may_match;
}
InternalIterator* BlockBasedTable::NewIterator(const ReadOptions& read_options,
Arena* arena,
bool skip_filters) {
return NewTwoLevelIterator(
new BlockEntryIteratorState(this, read_options, skip_filters),
NewIndexIterator(read_options), arena);
}
bool BlockBasedTable::FullFilterKeyMayMatch(FilterBlockReader* filter,
const Slice& internal_key) const {
if (filter == nullptr || filter->IsBlockBased()) {
return true;
}
Slice user_key = ExtractUserKey(internal_key);
if (!filter->KeyMayMatch(user_key)) {
return false;
}
if (rep_->ioptions.prefix_extractor &&
rep_->ioptions.prefix_extractor->InDomain(user_key) &&
!filter->PrefixMayMatch(
rep_->ioptions.prefix_extractor->Transform(user_key))) {
return false;
}
return true;
}
Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key,
GetContext* get_context, bool skip_filters) {
Status s;
CachableEntry<FilterBlockReader> filter_entry;
if (!skip_filters) {
filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier);
}
FilterBlockReader* filter = filter_entry.value;
if (!FullFilterKeyMayMatch(filter, key)) {
RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
} else {
BlockIter iiter;
NewIndexIterator(read_options, &iiter);
bool done = false;
for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) {
Slice handle_value = iiter.value();
BlockHandle handle;
bool not_exist_in_filter =
filter != nullptr && filter->IsBlockBased() == true &&
handle.DecodeFrom(&handle_value).ok() &&
!filter->KeyMayMatch(ExtractUserKey(key), handle.offset());
if (not_exist_in_filter) {
RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL);
break;
} else {
BlockIter biter;
NewDataBlockIterator(rep_, read_options, iiter.value(), &biter);
if (read_options.read_tier == kBlockCacheTier &&
biter.status().IsIncomplete()) {
get_context->MarkKeyMayExist();
break;
}
if (!biter.status().ok()) {
s = biter.status();
break;
}
for (biter.Seek(key); biter.Valid(); biter.Next()) {
ParsedInternalKey parsed_key;
if (!ParseInternalKey(biter.key(), &parsed_key)) {
s = Status::Corruption(Slice());
}
if (!get_context->SaveValue(parsed_key, biter.value())) {
done = true;
break;
}
}
s = biter.status();
}
}
if (s.ok()) {
s = iiter.status();
}
}
filter_entry.Release(rep_->table_options.block_cache.get());
return s;
}
Status BlockBasedTable::Prefetch(const Slice* const begin,
const Slice* const end) {
auto& comparator = rep_->internal_comparator;
if (begin && end && comparator.Compare(*begin, *end) > 0) {
return Status::InvalidArgument(*begin, *end);
}
BlockIter iiter;
NewIndexIterator(ReadOptions(), &iiter);
if (!iiter.status().ok()) {
return iiter.status();
}
bool prefetching_boundary_page = false;
for (begin ? iiter.Seek(*begin) : iiter.SeekToFirst(); iiter.Valid();
iiter.Next()) {
Slice block_handle = iiter.value();
if (end && comparator.Compare(iiter.key(), *end) >= 0) {
if (prefetching_boundary_page) {
break;
}
prefetching_boundary_page = true;
}
BlockIter biter;
NewDataBlockIterator(rep_, ReadOptions(), block_handle, &biter);
if (!biter.status().ok()) {
return biter.status();
}
}
return Status::OK();
}
bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options,
const Slice& key) {
std::unique_ptr<InternalIterator> iiter(NewIndexIterator(options));
iiter->Seek(key);
assert(iiter->Valid());
CachableEntry<Block> block;
BlockHandle handle;
Slice input = iiter->value();
Status s = handle.DecodeFrom(&input);
assert(s.ok());
Cache* block_cache = rep_->table_options.block_cache.get();
assert(block_cache != nullptr);
char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length];
Slice cache_key =
GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size,
handle, cache_key_storage);
Slice ckey;
s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr,
options, &block,
rep_->table_options.format_version);
assert(s.ok());
bool in_cache = block.value != nullptr;
if (in_cache) {
ReleaseCachedEntry(block_cache, block.cache_handle);
}
return in_cache;
}
Status BlockBasedTable::CreateIndexReader(
IndexReader** index_reader, InternalIterator* preloaded_meta_index_iter) {
auto index_type_on_file = BlockBasedTableOptions::kBinarySearch;
if (rep_->table_properties) {
auto& props = rep_->table_properties->user_collected_properties;
auto pos = props.find(BlockBasedTablePropertyNames::kIndexType);
if (pos != props.end()) {
index_type_on_file = static_cast<BlockBasedTableOptions::IndexType>(
DecodeFixed32(pos->second.c_str()));
}
}
auto file = rep_->file.get();
auto env = rep_->ioptions.env;
auto comparator = &rep_->internal_comparator;
const Footer& footer = rep_->footer;
if (index_type_on_file == BlockBasedTableOptions::kHashSearch &&
rep_->ioptions.prefix_extractor == nullptr) {
Log(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log,
"BlockBasedTableOptions::kHashSearch requires "
"options.prefix_extractor to be set."
" Fall back to binary search index.");
index_type_on_file = BlockBasedTableOptions::kBinarySearch;
}
switch (index_type_on_file) {
case BlockBasedTableOptions::kBinarySearch: {
return BinarySearchIndexReader::Create(
file, footer, footer.index_handle(), env, comparator, index_reader);
}
case BlockBasedTableOptions::kHashSearch: {
std::unique_ptr<Block> meta_guard;
std::unique_ptr<InternalIterator> meta_iter_guard;
auto meta_index_iter = preloaded_meta_index_iter;
if (meta_index_iter == nullptr) {
auto s = ReadMetaBlock(rep_, &meta_guard, &meta_iter_guard);
if (!s.ok()) {
Log(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log,
"Unable to read the metaindex block."
" Fall back to binary search index.");
return BinarySearchIndexReader::Create(
file, footer, footer.index_handle(), env, comparator, index_reader);
}
meta_index_iter = meta_iter_guard.get();
}
rep_->internal_prefix_transform.reset(
new InternalKeySliceTransform(rep_->ioptions.prefix_extractor));
return HashIndexReader::Create(
rep_->internal_prefix_transform.get(), footer, file, env, comparator,
footer.index_handle(), meta_index_iter, index_reader,
rep_->hash_index_allow_collision);
}
default: {
std::string error_message =
"Unrecognized index type: " + ToString(rep_->index_type);
return Status::InvalidArgument(error_message.c_str());
}
}
}
uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key) {
unique_ptr<InternalIterator> index_iter(NewIndexIterator(ReadOptions()));
index_iter->Seek(key);
uint64_t result;
if (index_iter->Valid()) {
BlockHandle handle;
Slice input = index_iter->value();
Status s = handle.DecodeFrom(&input);
if (s.ok()) {
result = handle.offset();
} else {
result = rep_->footer.metaindex_handle().offset();
}
} else {
result = 0;
if (rep_->table_properties) {
result = rep_->table_properties->data_size;
}
if (result == 0) {
result = rep_->footer.metaindex_handle().offset();
}
}
return result;
}
bool BlockBasedTable::TEST_filter_block_preloaded() const {
return rep_->filter != nullptr;
}
bool BlockBasedTable::TEST_index_reader_preloaded() const {
return rep_->index_reader != nullptr;
}
Status BlockBasedTable::DumpTable(WritableFile* out_file) {
out_file->Append(
"Footer Details:\n"
"--------------------------------------\n"
" ");
out_file->Append(rep_->footer.ToString().c_str());
out_file->Append("\n");
out_file->Append(
"Metaindex Details:\n"
"--------------------------------------\n");
std::unique_ptr<Block> meta;
std::unique_ptr<InternalIterator> meta_iter;
Status s = ReadMetaBlock(rep_, &meta, &meta_iter);
if (s.ok()) {
for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) {
s = meta_iter->status();
if (!s.ok()) {
return s;
}
if (meta_iter->key() == rocksdb::kPropertiesBlock) {
out_file->Append(" Properties block handle: ");
out_file->Append(meta_iter->value().ToString(true).c_str());
out_file->Append("\n");
} else if (strstr(meta_iter->key().ToString().c_str(),
"filter.rocksdb.") != nullptr) {
out_file->Append(" Filter block handle: ");
out_file->Append(meta_iter->value().ToString(true).c_str());
out_file->Append("\n");
}
}
out_file->Append("\n");
} else {
return s;
}
const rocksdb::TableProperties* table_properties;
table_properties = rep_->table_properties.get();
if (table_properties != nullptr) {
out_file->Append(
"Table Properties:\n"
"--------------------------------------\n"
" ");
out_file->Append(table_properties->ToString("\n ", ": ").c_str());
out_file->Append("\n");
}
if (!rep_->filter && !table_properties->filter_policy_name.empty()) {
rocksdb::BlockBasedTableOptions table_options;
table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1));
if (table_properties->filter_policy_name.compare(
table_options.filter_policy->Name()) == 0) {
std::string filter_block_key = kFilterBlockPrefix;
filter_block_key.append(table_properties->filter_policy_name);
BlockHandle handle;
if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) {
BlockContents block;
if (ReadBlockContents(rep_->file.get(), rep_->footer, ReadOptions(),
handle, &block, rep_->ioptions.env, false).ok()) {
rep_->filter.reset(new BlockBasedFilterBlockReader(
rep_->ioptions.prefix_extractor, table_options,
table_options.whole_key_filtering, std::move(block)));
}
}
}
}
if (rep_->filter) {
out_file->Append(
"Filter Details:\n"
"--------------------------------------\n"
" ");
out_file->Append(rep_->filter->ToString().c_str());
out_file->Append("\n");
}
s = DumpIndexBlock(out_file);
if (!s.ok()) {
return s;
}
s = DumpDataBlocks(out_file);
return s;
}
Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) {
out_file->Append(
"Index Details:\n"
"--------------------------------------\n");
std::unique_ptr<InternalIterator> blockhandles_iter(
NewIndexIterator(ReadOptions()));
Status s = blockhandles_iter->status();
if (!s.ok()) {
out_file->Append("Can not read Index Block \n\n");
return s;
}
out_file->Append(" Block key hex dump: Data block handle\n");
out_file->Append(" Block key ascii\n\n");
for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
blockhandles_iter->Next()) {
s = blockhandles_iter->status();
if (!s.ok()) {
break;
}
Slice key = blockhandles_iter->key();
InternalKey ikey;
ikey.DecodeFrom(key);
out_file->Append(" HEX ");
out_file->Append(ikey.user_key().ToString(true).c_str());
out_file->Append(": ");
out_file->Append(blockhandles_iter->value().ToString(true).c_str());
out_file->Append("\n");
std::string str_key = ikey.user_key().ToString();
std::string res_key("");
char cspace = ' ';
for (size_t i = 0; i < str_key.size(); i++) {
res_key.append(&str_key[i], 1);
res_key.append(1, cspace);
}
out_file->Append(" ASCII ");
out_file->Append(res_key.c_str());
out_file->Append("\n ------\n");
}
out_file->Append("\n");
return Status::OK();
}
Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) {
std::unique_ptr<InternalIterator> blockhandles_iter(
NewIndexIterator(ReadOptions()));
Status s = blockhandles_iter->status();
if (!s.ok()) {
out_file->Append("Can not read Index Block \n\n");
return s;
}
size_t block_id = 1;
for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid();
block_id++, blockhandles_iter->Next()) {
s = blockhandles_iter->status();
if (!s.ok()) {
break;
}
out_file->Append("Data Block # ");
out_file->Append(rocksdb::ToString(block_id));
out_file->Append(" @ ");
out_file->Append(blockhandles_iter->value().ToString(true).c_str());
out_file->Append("\n");
out_file->Append("--------------------------------------\n");
std::unique_ptr<InternalIterator> datablock_iter;
datablock_iter.reset(
NewDataBlockIterator(rep_, ReadOptions(), blockhandles_iter->value()));
s = datablock_iter->status();
if (!s.ok()) {
out_file->Append("Error reading the block - Skipped \n\n");
continue;
}
for (datablock_iter->SeekToFirst(); datablock_iter->Valid();
datablock_iter->Next()) {
s = datablock_iter->status();
if (!s.ok()) {
out_file->Append("Error reading the block - Skipped \n");
break;
}
Slice key = datablock_iter->key();
Slice value = datablock_iter->value();
InternalKey ikey, iValue;
ikey.DecodeFrom(key);
iValue.DecodeFrom(value);
out_file->Append(" HEX ");
out_file->Append(ikey.user_key().ToString(true).c_str());
out_file->Append(": ");
out_file->Append(iValue.user_key().ToString(true).c_str());
out_file->Append("\n");
std::string str_key = ikey.user_key().ToString();
std::string str_value = iValue.user_key().ToString();
std::string res_key(""), res_value("");
char cspace = ' ';
for (size_t i = 0; i < str_key.size(); i++) {
res_key.append(&str_key[i], 1);
res_key.append(1, cspace);
}
for (size_t i = 0; i < str_value.size(); i++) {
res_value.append(&str_value[i], 1);
res_value.append(1, cspace);
}
out_file->Append(" ASCII ");
out_file->Append(res_key.c_str());
out_file->Append(": ");
out_file->Append(res_value.c_str());
out_file->Append("\n ------\n");
}
out_file->Append("\n");
}
return Status::OK();
}
}