#include "table/format.h"
#include <cinttypes>
#include <cstdint>
#include <string>
#include "block_fetcher.h"
#include "file/random_access_file_reader.h"
#include "memory/memory_allocator_impl.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics_impl.h"
#include "options/options_helper.h"
#include "port/likely.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"
#include "table/block_based/block.h"
#include "table/block_based/block_based_table_reader.h"
#include "table/persistent_cache_helper.h"
#include "unique_id_impl.h"
#include "util/cast_util.h"
#include "util/coding.h"
#include "util/compression.h"
#include "util/crc32c.h"
#include "util/hash.h"
#include "util/stop_watch.h"
#include "util/string_util.h"
#include "util/xxhash.h"
namespace ROCKSDB_NAMESPACE {
// Placeholder db_host_id value. ReifyDbHostIdProperty() replaces a db_host_id
// equal to this string with the result of Env::GetHostNameString().
const char* kHostnameForDbHostId = "__hostname__";
// Returns true only when both `env` and `stats` are non-null and the stats
// level configured on `stats` is above kExceptDetailedTimers.
bool ShouldReportDetailedTime(Env* env, Statistics* stats) {
  if (env == nullptr || stats == nullptr) {
    return false;
  }
  return stats->get_stats_level() > kExceptDetailedTimers;
}
void BlockHandle::EncodeTo(std::string* dst) const {
assert(offset_ != ~uint64_t{0});
assert(size_ != ~uint64_t{0});
PutVarint64Varint64(dst, offset_, size_);
}
// Writes the varint64 encoding of offset_ and then size_ starting at `dst`.
// Returns the pointer just past the last byte written.
char* BlockHandle::EncodeTo(char* dst) const {
  // Both fields must have been set to something other than all-ones.
  assert(offset_ != ~uint64_t{0});
  assert(size_ != ~uint64_t{0});
  char* const after_offset = EncodeVarint64(dst, offset_);
  return EncodeVarint64(after_offset, size_);
}
// Reads a varint64 offset and then a varint64 size from *input into this
// handle. On failure both fields are reset to 0 and Corruption is returned.
Status BlockHandle::DecodeFrom(Slice* input) {
  const bool decoded =
      GetVarint64(input, &offset_) && GetVarint64(input, &size_);
  if (!decoded) {
    // The offset may already have been overwritten; clear both fields.
    offset_ = 0;
    size_ = 0;
    return Status::Corruption("bad block handle");
  }
  return Status::OK();
}
// Reads only a varint64 size from *input and takes the offset from `_offset`.
// On failure both fields are reset to 0 and Corruption is returned.
Status BlockHandle::DecodeSizeFrom(uint64_t _offset, Slice* input) {
  if (!GetVarint64(input, &size_)) {
    offset_ = 0;
    size_ = 0;
    return Status::Corruption("bad block handle");
  }
  offset_ = _offset;
  return Status::OK();
}
// Returns the encoded form of this handle, either as raw bytes or, when `hex`
// is true, as the hex rendering produced by Slice::ToString(true).
std::string BlockHandle::ToString(bool hex) const {
  std::string encoded;
  EncodeTo(&encoded);
  if (!hex) {
    return encoded;
  }
  return Slice(encoded).ToString(true);
}
// Definition of the static null handle: offset 0 and size 0.
const BlockHandle BlockHandle::kNullBlockHandle(0, 0);
// Appends the encoding of this index value to *dst.
//
// Without `previous_handle`, the block handle is written in full. With it,
// only the signed difference between this handle's size and the previous
// handle's size is written; the offset is implied by the previous handle.
// When `have_first_key` is set, first_internal_key follows as a
// length-prefixed slice.
void IndexValue::EncodeTo(std::string* dst, bool have_first_key,
                          const BlockHandle* previous_handle) const {
  if (previous_handle == nullptr) {
    handle.EncodeTo(dst);
  } else {
    // Delta encoding requires this block to start right after the previous
    // block plus its block-based-table trailer.
    assert(handle.offset() == previous_handle->offset() +
                                  previous_handle->size() +
                                  BlockBasedTable::kBlockTrailerSize);
    PutVarsignedint64(dst, handle.size() - previous_handle->size());
  }
  assert(dst->size() != 0);

  if (!have_first_key) {
    return;
  }
  PutLengthPrefixedSlice(dst, first_internal_key);
}
// Decodes an index value from *input.
//
// With `previous_handle`, a signed size delta is read and the handle is
// reconstructed relative to the previous handle; otherwise a full block handle
// is read. When `have_first_key` is set a length-prefixed first_internal_key
// is read next, otherwise first_internal_key is set to an empty Slice.
// Returns Corruption if any piece cannot be decoded.
Status IndexValue::DecodeFrom(Slice* input, bool have_first_key,
                              const BlockHandle* previous_handle) {
  if (previous_handle == nullptr) {
    Status handle_status = handle.DecodeFrom(input);
    if (!handle_status.ok()) {
      return handle_status;
    }
  } else {
    int64_t size_delta;
    if (!GetVarsignedint64(input, &size_delta)) {
      return Status::Corruption("bad delta-encoded index value");
    }
    // The block is taken to start right after the previous block and its
    // block-based-table trailer.
    handle = BlockHandle(previous_handle->offset() + previous_handle->size() +
                             BlockBasedTable::kBlockTrailerSize,
                         previous_handle->size() + size_delta);
  }

  if (have_first_key) {
    if (!GetLengthPrefixedSlice(input, &first_internal_key)) {
      return Status::Corruption("bad first key in block info");
    }
  } else {
    first_internal_key = Slice();
  }
  return Status::OK();
}
// Returns the full (non-delta) encoding of this index value, either as raw
// bytes or, when `hex` is true, as the rendering of Slice::ToString(true).
std::string IndexValue::ToString(bool hex, bool have_first_key) const {
  std::string encoded;
  EncodeTo(&encoded, have_first_key, nullptr);
  if (!hex) {
    return encoded;
  }
  return Slice(encoded).ToString(true);
}
namespace {

// True when `magic_number` is the legacy plain-table magic number.
inline bool IsLegacyFooterFormat(uint64_t magic_number) {
  const bool is_legacy_plain_table =
      (kLegacyPlainTableMagicNumber == magic_number);
  return is_legacy_plain_table;
}

// Maps the legacy plain-table magic number to the current one. Any other
// input trips an assertion and is returned unchanged.
inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) {
  if (magic_number != kLegacyPlainTableMagicNumber) {
    assert(false);
    return magic_number;
  }
  return kPlainTableMagicNumber;
}

// Maps the current plain-table magic number to the legacy one. Any other
// input trips an assertion and is returned unchanged.
inline uint64_t DownconvertToLegacyFooterFormat(uint64_t magic_number) {
  if (magic_number != kPlainTableMagicNumber) {
    assert(false);
    return magic_number;
  }
  return kLegacyPlainTableMagicNumber;
}

// Block trailer size for tables with the given magic number:
// BlockBasedTable::kBlockTrailerSize for block-based tables, 0 otherwise.
inline uint8_t BlockTrailerSizeForMagicNumber(uint64_t magic_number) {
  if (magic_number != kBlockBasedTableMagicNumber) {
    return 0;
  }
  return static_cast<uint8_t>(BlockBasedTable::kBlockTrailerSize);
}

// Four-byte marker that FooterBuilder::Build writes at the start of the
// footer's middle section for format_version >= 6 and that
// Footer::DecodeFrom verifies.
const std::array<char, 4> kExtendedMagic{{0x3e, 0x00, 0x7a, 0x00}};

// Size in bytes of the footer's middle section: room for two maximally sized
// encoded block handles.
constexpr size_t kFooterPart2Size = BlockHandle::kMaxEncodedLength * 2;

}  // namespace
// Serializes a table footer into data_ and points slice_ at the encoded bytes.
//
// Layout written here:
//   format_version > 0 : 1 byte checksum type, kFooterPart2Size bytes of
//                        "part 2", then 4 bytes format version and 8 bytes
//                        magic number.
//   format_version == 0: kFooterPart2Size bytes of "part 2", then 8 bytes of
//                        the legacy (down-converted) magic number.
// Part 2 holds the two encoded block handles (format_version < 6) or, for
// format_version >= 6, the extended magic, a footer checksum, the base context
// checksum, the metaindex size and zero padding.
//
// Returns NotSupported if format_version >= 6 and the metaindex block size
// does not fit in 32 bits; otherwise returns OK.
Status FooterBuilder::Build(uint64_t magic_number, uint32_t format_version,
uint64_t footer_offset, ChecksumType checksum_type,
const BlockHandle& metaindex_handle,
const BlockHandle& index_handle,
uint32_t base_context_checksum) {
assert(magic_number != Footer::kNullTableMagicNumber);
assert(IsSupportedFormatVersionForWrite(magic_number, format_version) ||
TEST_AllowUnsupportedFormatVersion());
// part2 / part3 mark where the middle and trailing sections begin in data_.
char* part2;
char* part3;
if (format_version > 0) {
slice_ = Slice(data_.data(), Footer::kNewVersionsEncodedLength);
char* cur = data_.data();
// Part 1: a single byte holding the checksum type.
*(cur++) = checksum_type;
part2 = cur;
// Part 2 is filled in further down; skip over it for now.
cur += kFooterPart2Size;
// Part 3: format version followed by the magic number.
part3 = cur;
EncodeFixed32(cur, format_version);
cur += 4;
EncodeFixed64(cur, magic_number);
assert(cur + 8 == slice_.data() + slice_.size());
} else {
slice_ = Slice(data_.data(), Footer::kVersion0EncodedLength);
// The checksum type is not stored in this layout, so only these two values
// are accepted.
assert(checksum_type == kNoChecksum || checksum_type == kCRC32c);
// There is no part 1 here: part 2 starts at the beginning of the buffer.
part2 = data_.data();
part3 = part2 + kFooterPart2Size;
char* cur = part3;
// format_version 0 is written with the legacy magic number.
EncodeFixed64(cur, DownconvertToLegacyFooterFormat(magic_number));
assert(cur + 8 == slice_.data() + slice_.size());
}
if (format_version >= 6) {
if (BlockTrailerSizeForMagicNumber(magic_number) != 0) {
// Tables with a block trailer must supply a non-zero base context checksum.
assert(base_context_checksum != 0);
assert(ChecksumModifierForContext(base_context_checksum, 0) != 0);
} else {
// Tables without a block trailer must leave it zero.
assert(base_context_checksum == 0);
assert(ChecksumModifierForContext(base_context_checksum, 0) == 0);
}
// Part 2 starts right after the one-byte part 1.
char* cur = data_.data() + 1;
std::copy(kExtendedMagic.begin(), kExtendedMagic.end(), cur);
cur += kExtendedMagic.size();
// Write zeros into the checksum field for now; the real value is stored
// after the checksum over the whole footer has been computed.
char* checksum_data = cur;
EncodeFixed32(cur, 0);
cur += 4;
EncodeFixed32(cur, base_context_checksum);
cur += 4;
// Only the metaindex size is stored, as a 32-bit value.
uint32_t metaindex_size = static_cast<uint32_t>(metaindex_handle.size());
if (metaindex_size != metaindex_handle.size()) {
return Status::NotSupported("Metaindex block size > 4GB");
}
// A non-empty metaindex block must end exactly one block trailer before the
// footer.
assert(metaindex_size == 0 ||
metaindex_handle.offset() + metaindex_handle.size() ==
footer_offset - BlockTrailerSizeForMagicNumber(magic_number));
EncodeFixed32(cur, metaindex_size);
cur += 4;
// Zero-fill the remaining 24 bytes of part 2.
std::fill_n(cur, 24U, char{0});
assert(cur + 24 == part3);
// Checksum the whole footer (checksum field still zero), add the modifier
// derived from the base context checksum and the footer offset, then store
// the result in the checksum field.
uint32_t checksum = ComputeBuiltinChecksum(
checksum_type, data_.data(), Footer::kNewVersionsEncodedLength);
checksum +=
ChecksumModifierForContext(base_context_checksum, footer_offset);
EncodeFixed32(checksum_data, checksum);
} else {
assert(!FormatVersionUsesContextChecksum(format_version));
assert(base_context_checksum == 0);
assert(ChecksumModifierForContext(base_context_checksum, 0) == 0);
// Part 2 is the metaindex handle, then the index handle, then zero padding
// up to part 3.
char* cur = part2;
cur = metaindex_handle.EncodeTo(cur);
cur = index_handle.EncodeTo(cur);
std::fill(cur, part3, char{0});
}
return Status::OK();
}
// Decodes a serialized footer found at the end of `input` into this Footer,
// which must not have been populated yet.
//
// input:        bytes whose tail is the encoded footer; must be at least
//               kMinEncodedLength bytes long.
// input_offset: offset corresponding to the first byte of `input`; it is
//               used to derive the footer's own offset, which takes part in
//               the footer checksum and locates the metaindex block for
//               format_version >= 6.
// enforce_table_magic_number: when non-zero, the (up-converted) magic number
//               found in the footer must equal this value.
//
// Returns Corruption for a magic number mismatch, an unsupported format
// version or checksum type, a too-short input, a bad extended magic, an
// invalid base context checksum, a footer checksum mismatch or an undecodable
// block handle. Returns NotSupported for the retired legacy block-based magic
// number and for a non-zero checked reserved field.
Status Footer::DecodeFrom(Slice input, uint64_t input_offset,
                          uint64_t enforce_table_magic_number) {
  // Only decode into a Footer that has not been used yet.
  assert(table_magic_number_ == kNullTableMagicNumber);
  assert(input != nullptr);
  assert(input.size() >= kMinEncodedLength);

  // The magic number occupies the last kMagicNumberLengthByte bytes.
  const char* magic_ptr = input.data() + input.size() - kMagicNumberLengthByte;
  uint64_t magic = DecodeFixed64(magic_ptr);

  if (magic == 0xdb4775248b80fb57ull) {
    return Status::NotSupported(
        "Unsupported legacy magic number for block-based SST format. Load with "
        "RocksDB >= 4.6.0 and < 11.0.0 and run full compaction to upgrade.");
  }

  // Legacy magic numbers are converted to their current equivalent.
  bool legacy = IsLegacyFooterFormat(magic);
  if (legacy) {
    magic = UpconvertLegacyFooterFormat(magic);
  }
  if (enforce_table_magic_number != 0 && enforce_table_magic_number != magic) {
    return Status::Corruption("Bad table magic number: expected " +
                              std::to_string(enforce_table_magic_number) +
                              ", found " + std::to_string(magic));
  }
  table_magic_number_ = magic;
  block_trailer_size_ = BlockTrailerSizeForMagicNumber(magic);

  // Part 3 (and, for non-legacy footers, part 1).
  const char* part3_ptr = magic_ptr;
  uint32_t computed_checksum = 0;
  uint64_t footer_offset = 0;
  if (legacy) {
    // Part 3 is just the magic number; keep the last kVersion0EncodedLength
    // bytes so that `input` starts at part 2.
    input.remove_prefix(input.size() - kVersion0EncodedLength);
    format_version_ = 0;
    checksum_type_ = kCRC32c;
  } else {
    // The 4 bytes before the magic number hold the format version.
    part3_ptr = magic_ptr - 4;
    format_version_ = DecodeFixed32(part3_ptr);
    if (UNLIKELY(!IsSupportedFormatVersionForRead(magic, format_version_) &&
                 !TEST_AllowUnsupportedFormatVersion())) {
      return Status::Corruption("Corrupt or unsupported format_version " +
                                std::to_string(format_version_) +
                                " for magic " + std::to_string(magic));
    }
    if (UNLIKELY(input.size() < kNewVersionsEncodedLength)) {
      return Status::Corruption("Input is too short to be an SST file");
    }
    // Trim `input` so that it begins at the first byte of the footer.
    uint64_t adjustment = input.size() - kNewVersionsEncodedLength;
    input.remove_prefix(adjustment);
    footer_offset = input_offset + adjustment;

    // Part 1: one byte of checksum type.
    char chksum = input.data()[0];
    checksum_type_ = lossless_cast<ChecksumType>(chksum);
    if (UNLIKELY(!IsSupportedChecksumType(checksum_type()))) {
      return Status::Corruption("Corrupt or unsupported checksum type: " +
                                std::to_string(lossless_cast<uint8_t>(chksum)));
    }
    // The whole footer is still contiguous in `input` here, so compute the
    // footer checksum now, over a copy whose checksum field is zeroed.
    if (checksum_type_ != kNoChecksum && format_version_ >= 6) {
      std::array<char, kNewVersionsEncodedLength> copy_without_checksum;
      std::copy_n(input.data(), kNewVersionsEncodedLength,
                  copy_without_checksum.data());
      // The stored checksum sits at byte 5: after the 1-byte checksum type
      // and the 4-byte extended magic.
      EncodeFixed32(&copy_without_checksum[5], 0);
      computed_checksum =
          ComputeBuiltinChecksum(checksum_type(), copy_without_checksum.data(),
                                 kNewVersionsEncodedLength);
    }
    // Consume the checksum type byte; `input` now starts at part 2.
    input.remove_prefix(1);
  }

  // Part 2.
  if (format_version_ >= 6) {
    Slice ext_magic(input.data(), 4);
    if (UNLIKELY(ext_magic.compare(Slice(kExtendedMagic.data(),
                                         kExtendedMagic.size())) != 0)) {
      return Status::Corruption("Bad extended magic number: 0x" +
                                ext_magic.ToString(true));
    }
    input.remove_prefix(4);
    uint32_t stored_checksum = 0, metaindex_size = 0;
    bool success;
    success = GetFixed32(&input, &stored_checksum);
    assert(success);
    success = GetFixed32(&input, &base_context_checksum_);
    assert(success);
    if (UNLIKELY(ChecksumModifierForContext(base_context_checksum_, 0) == 0)) {
      return Status::Corruption("Invalid base context checksum");
    }
    // The stored checksum includes a modifier derived from the base context
    // checksum and the footer's offset.
    computed_checksum +=
        ChecksumModifierForContext(base_context_checksum_, footer_offset);
    if (UNLIKELY(computed_checksum != stored_checksum)) {
      return Status::Corruption("Footer at " + std::to_string(footer_offset) +
                                " checksum mismatch");
    }
    success = GetFixed32(&input, &metaindex_size);
    assert(success);
    (void)success;
    // Only the metaindex size is stored; the block ends one block trailer
    // before the footer.
    uint64_t metaindex_end = footer_offset - GetBlockTrailerSize();
    metaindex_handle_ =
        BlockHandle(metaindex_end - metaindex_size, metaindex_size);
    // No index handle is stored in this layout.
    index_handle_ = BlockHandle::NullBlockHandle();
    // Skip 16 reserved bytes without inspecting them.
    input.remove_prefix(16U);
    // The next 8 reserved bytes must be zero.
    uint64_t reserved = 0;
    success = GetFixed64(&input, &reserved);
    assert(success);
    if (UNLIKELY(reserved != 0)) {
      return Status::NotSupported(
          "File uses a future feature not supported in this version");
    }
    // Part 2 has been consumed exactly.
    assert(input.data() == part3_ptr);
  } else {
    // Part 2 starts with the metaindex handle followed by the index handle;
    // whatever follows them is not examined.
    Status result = metaindex_handle_.DecodeFrom(&input);
    if (result.ok()) {
      result = index_handle_.DecodeFrom(&input);
    }
    if (!result.ok()) {
      return result;
    }
  }
  return Status::OK();
}
// Builds a multi-line, human-readable description of the footer: both block
// handles (encoded form, offset and size), the table magic number and, when
// the magic number is not a legacy one, the format version.
std::string Footer::ToString() const {
  std::string text;
  text.reserve(1024);

  text += "metaindex handle: ";
  text += metaindex_handle_.ToString();
  text += " offset: ";
  text += std::to_string(metaindex_handle_.offset());
  text += " size: ";
  text += std::to_string(metaindex_handle_.size());
  text += "\n ";

  text += "index handle: ";
  text += index_handle_.ToString();
  text += " offset: ";
  text += std::to_string(index_handle_.offset());
  text += " size: ";
  text += std::to_string(index_handle_.size());
  text += "\n ";

  text += "table_magic_number: ";
  text += std::to_string(table_magic_number_);
  text += "\n ";

  if (!IsLegacyFooterFormat(table_magic_number_)) {
    text += "format version: ";
    text += std::to_string(format_version_);
    text += "\n";
  }
  return text;
}
// Returns a mutable reference to a process-wide flag, initially false. The
// footer encode/decode paths in this file consult it to let otherwise
// unsupported format versions through.
bool& TEST_AllowUnsupportedFormatVersion() {
  static bool allow_unsupported{false};
  return allow_unsupported;
}
// Reads the last bytes of `file` and decodes them into *footer.
//
// The file's actual size must equal `expected_file_size` and be at least
// Footer::kMinEncodedLength. The footer bytes are taken from
// `prefetch_buffer` when it is non-null and can serve them, otherwise they
// are read from the file. Any decode error is returned with the file name
// appended.
static Status ReadFooterFromFileInternal(
    const IOOptions& opts, RandomAccessFileReader* file, FileSystem& fs,
    FilePrefetchBuffer* prefetch_buffer, uint64_t expected_file_size,
    Footer* footer, uint64_t enforce_table_magic_number) {
  // Ask the open file for its size first; fall back to a lookup by name.
  uint64_t actual_file_size = 0;
  Status s;
  s = file->file()->GetFileSize(&actual_file_size);
  if (!s.ok()) {
    s = fs.GetFileSize(file->file_name(), IOOptions(), &actual_file_size,
                       nullptr);
    if (!s.ok()) {
      return s;
    }
  }

  if (actual_file_size != expected_file_size) {
    std::string msg = "Sst file size mismatch between expected ";
    msg += std::to_string(expected_file_size);
    msg += " and file system ";
    msg += std::to_string(actual_file_size);
    msg += " sstable: ";
    msg += file->file_name();
    return Status::Corruption(msg);
  }
  if (expected_file_size < Footer::kMinEncodedLength) {
    std::string msg = "file is too short (";
    msg += std::to_string(expected_file_size);
    msg += " bytes) to be an sstable: ";
    msg += file->file_name();
    return Status::Corruption(msg);
  }

  std::array<char, Footer::kMaxEncodedLength + 1> footer_buf;
  AlignedBuf internal_buf;
  Slice footer_input;

  // Read up to kMaxEncodedLength bytes ending at the end of the file.
  uint64_t read_offset = 0;
  if (expected_file_size > Footer::kMaxEncodedLength) {
    read_offset = expected_file_size - Footer::kMaxEncodedLength;
  }

  const bool served_by_prefetch =
      prefetch_buffer != nullptr &&
      prefetch_buffer->TryReadFromCache(opts, file, read_offset,
                                        Footer::kMaxEncodedLength,
                                        &footer_input, nullptr);
  if (!served_by_prefetch) {
    if (!file->use_direct_io()) {
      // Buffered read into the local scratch array.
      s = file->Read(opts, read_offset, Footer::kMaxEncodedLength,
                     &footer_input, footer_buf.data(), nullptr);
    } else {
      // Direct I/O read into internal_buf.
      s = file->Read(opts, read_offset, Footer::kMaxEncodedLength,
                     &footer_input, nullptr, &internal_buf);
    }
    if (!s.ok()) {
      return s;
    }
  }

  TEST_SYNC_POINT_CALLBACK("ReadFooterFromFileInternal:0", &footer_input);

  // The read may have returned fewer bytes than requested.
  if (footer_input.size() < Footer::kMinEncodedLength) {
    std::string msg = "The number of bytes read for Footer input ";
    msg += std::to_string(footer_input.size());
    msg += " is smaller than minimum footer encoded length: ";
    msg += std::to_string(Footer::kMinEncodedLength);
    msg += " for file ";
    msg += file->file_name();
    msg += "\n";
    return Status::Corruption(msg);
  }

  s = footer->DecodeFrom(footer_input, read_offset, enforce_table_magic_number);
  if (s.ok()) {
    return Status::OK();
  }
  s = Status::CopyAppendMessage(s, " in ", file->file_name());
  return s;
}
// Reads and decodes the footer of `file` into *footer.
//
// If the first attempt reports Corruption and the file system supports
// FSSupportedOps::kVerifyAndReconstructRead, the footer is reset and the read
// is retried once with verify_and_reconstruct_read enabled and without the
// prefetch buffer. The retry, and its success, are counted in `stats`.
Status ReadFooterFromFile(const IOOptions& opts, RandomAccessFileReader* file,
                          FileSystem& fs, FilePrefetchBuffer* prefetch_buffer,
                          uint64_t expected_file_size, Footer* footer,
                          uint64_t enforce_table_magic_number,
                          Statistics* stats) {
  Status s = ReadFooterFromFileInternal(opts, file, fs, prefetch_buffer,
                                        expected_file_size, footer,
                                        enforce_table_magic_number);
  const bool should_retry =
      s.IsCorruption() &&
      CheckFSFeatureSupport(&fs, FSSupportedOps::kVerifyAndReconstructRead);
  if (!should_retry) {
    return s;
  }

  IOOptions retry_opts = opts;
  retry_opts.verify_and_reconstruct_read = true;
  footer->Reset();
  s = ReadFooterFromFileInternal(retry_opts, file, fs, nullptr,
                                 expected_file_size, footer,
                                 enforce_table_magic_number);
  RecordTick(stats, FILE_READ_CORRUPTION_RETRY_COUNT);
  if (s.ok()) {
    RecordTick(stats, FILE_READ_CORRUPTION_RETRY_SUCCESS_COUNT);
  }
  return s;
}
namespace {

// Folds one extra byte into `checksum`: the byte, taken as an unsigned 8-bit
// value, is multiplied by a fixed constant and XORed into the checksum.
inline uint32_t ModifyChecksumForLastByte(uint32_t checksum, char last_byte) {
  const uint32_t kRandomPrime = 0x6b9083d9;
  const uint32_t byte_mix =
      static_cast<uint32_t>(lossless_cast<uint8_t>(last_byte)) * kRandomPrime;
  return checksum ^ byte_mix;
}

}  // namespace
// Computes the 32-bit checksum of `data[0, data_size)` for the given built-in
// checksum type. Types not handled below yield 0.
uint32_t ComputeBuiltinChecksum(ChecksumType type, const char* data,
                                size_t data_size) {
  switch (type) {
    case kCRC32c: {
      const uint32_t raw_crc = crc32c::Value(data, data_size);
      return crc32c::Mask(raw_crc);
    }
    case kxxHash:
      return XXH32(data, data_size, 0);
    case kxxHash64:
      return Lower32of64(XXH64(data, data_size, 0));
    case kXXH3: {
      // Empty input has no last byte to fold in, so it maps to 0.
      if (data_size == 0) {
        return 0;
      }
      // Hash everything except the last byte, then fold that byte in
      // separately.
      const size_t prefix_size = data_size - 1;
      const uint32_t prefix_hash = Lower32of64(XXH3_64bits(data, prefix_size));
      return ModifyChecksumForLastByte(prefix_hash, data[prefix_size]);
    }
    default:
      break;
  }
  return 0;
}
// Computes the 32-bit checksum of `data[0, data_size)` followed by the single
// byte `last_byte`, without requiring that byte to be stored next to the
// data. Types not handled below yield 0.
uint32_t ComputeBuiltinChecksumWithLastByte(ChecksumType type, const char* data,
                                            size_t data_size, char last_byte) {
  switch (type) {
    case kCRC32c: {
      // CRC of the data, extended by the trailing byte, then masked.
      const uint32_t data_crc = crc32c::Value(data, data_size);
      const uint32_t full_crc = crc32c::Extend(data_crc, &last_byte, 1);
      return crc32c::Mask(full_crc);
    }
    case kxxHash: {
      // Streaming XXH32 over the data and then the trailing byte.
      XXH32_state_t* const hasher = XXH32_createState();
      XXH32_reset(hasher, 0);
      XXH32_update(hasher, data, data_size);
      XXH32_update(hasher, &last_byte, 1);
      uint32_t digest = XXH32_digest(hasher);
      XXH32_freeState(hasher);
      return digest;
    }
    case kxxHash64: {
      // Streaming XXH64 over the data and then the trailing byte.
      XXH64_state_t* const hasher = XXH64_createState();
      XXH64_reset(hasher, 0);
      XXH64_update(hasher, data, data_size);
      XXH64_update(hasher, &last_byte, 1);
      uint32_t digest = Lower32of64(XXH64_digest(hasher));
      XXH64_freeState(hasher);
      return digest;
    }
    case kXXH3: {
      // Hash the data, then fold the trailing byte in separately.
      const uint32_t data_hash = Lower32of64(XXH3_64bits(data, data_size));
      return ModifyChecksumForLastByte(data_hash, last_byte);
    }
    default:
      break;
  }
  return 0;
}
// Decompresses the block described by `args` into a newly allocated buffer
// and stores the result in *out_contents.
//
// The uncompressed size is extracted first, a buffer of that size is obtained
// through AllocateBlock() with `allocator`, and the decompressor fills it.
// A failure in either decompressor step is returned immediately. On success
// the decompression statistics are recorded in ioptions.stats.
Status DecompressBlockData(Decompressor::Args& args, Decompressor& decompressor,
                           BlockContents* out_contents,
                           const ImmutableOptions& ioptions,
                           MemoryAllocator* allocator) {
  assert(args.compression_type != kNoCompression && "Invalid compression type");

  StopWatchNano timer(ioptions.clock,
                      ShouldReportDetailedTime(ioptions.env, ioptions.stats));

  Status status = decompressor.ExtractUncompressedSize(args);
  if (UNLIKELY(!status.ok())) {
    return status;
  }

  CacheAllocationPtr uncompressed_buf =
      AllocateBlock(args.uncompressed_size, allocator);
  status = decompressor.DecompressBlock(args, uncompressed_buf.get());
  if (UNLIKELY(!status.ok())) {
    return status;
  }
  *out_contents =
      BlockContents(std::move(uncompressed_buf), args.uncompressed_size);

  // The timing histogram is updated only when detailed timers are enabled.
  if (ShouldReportDetailedTime(ioptions.env, ioptions.stats)) {
    RecordTimeToHistogram(ioptions.stats, DECOMPRESSION_TIMES_NANOS,
                          timer.ElapsedNanos());
  }
  RecordTick(ioptions.stats, BYTES_DECOMPRESSED_FROM,
             args.compressed_data.size());
  RecordTick(ioptions.stats, BYTES_DECOMPRESSED_TO, out_contents->data.size());
  RecordTick(ioptions.stats, NUMBER_BLOCK_DECOMPRESSED);

  // Hooks that receive pointers to the status and to the output.
  TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithReturnValue",
                           static_cast<void*>(&status));
  TEST_SYNC_POINT_CALLBACK("DecompressBlockData:TamperWithDecompressionOutput",
                           static_cast<void*>(out_contents));
  return status;
}
// Convenience overload: packs the raw compressed bytes, the compression type
// and the working area into a Decompressor::Args and forwards to the
// Args-based DecompressBlockData().
Status DecompressBlockData(const char* data, size_t size, CompressionType type,
                           Decompressor& decompressor,
                           BlockContents* out_contents,
                           const ImmutableOptions& ioptions,
                           MemoryAllocator* allocator,
                           Decompressor::ManagedWorkingArea* working_area) {
  Decompressor::Args decompress_args;
  decompress_args.working_area = working_area;
  decompress_args.compression_type = type;
  decompress_args.compressed_data = Slice(data, size);
  return DecompressBlockData(decompress_args, decompressor, out_contents,
                             ioptions, allocator);
}
// Decompresses a serialized block. In debug builds the byte immediately
// following the `size` bytes of data is checked: it must not be
// kNoCompression and must equal `type`.
Status DecompressSerializedBlock(const char* data, size_t size,
                                 CompressionType type,
                                 Decompressor& decompressor,
                                 BlockContents* out_contents,
                                 const ImmutableOptions& ioptions,
                                 MemoryAllocator* allocator) {
#ifndef NDEBUG
  const char stored_type = data[size];
  assert(stored_type != kNoCompression);
  assert(stored_type == static_cast<char>(type));
#endif
  return DecompressBlockData(data, size, type, decompressor, out_contents,
                             ioptions, allocator);
}
// Args-based variant of DecompressSerializedBlock(). In debug builds the byte
// immediately following args.compressed_data is checked: it must not be
// kNoCompression and must equal args.compression_type.
Status DecompressSerializedBlock(Decompressor::Args& args,
                                 Decompressor& decompressor,
                                 BlockContents* out_contents,
                                 const ImmutableOptions& ioptions,
                                 MemoryAllocator* allocator) {
#ifndef NDEBUG
  const char stored_type =
      args.compressed_data.data()[args.compressed_data.size()];
  assert(stored_type != kNoCompression);
  assert(stored_type == static_cast<char>(args.compression_type));
#endif
  return DecompressBlockData(args, decompressor, out_contents, ioptions,
                             allocator);
}
// If *db_host_id equals the kHostnameForDbHostId placeholder, replaces it with
// the host name obtained from `env` and returns the status of that lookup;
// on failure *db_host_id is cleared. Any other value is left untouched and OK
// is returned.
Status ReifyDbHostIdProperty(Env* env, std::string* db_host_id) {
  assert(db_host_id);
  if (*db_host_id != kHostnameForDbHostId) {
    return Status::OK();
  }
  Status lookup_status = env->GetHostNameString(db_host_id);
  if (!lookup_status.ok()) {
    db_host_id->clear();
  }
  return lookup_status;
}
}