#include <config.h>
#include "blockhash.h"
#include <stdint.h>
#include <string.h>
#include <algorithm>
#include "compile_assert.h"
#include "logging.h"
#include "rolling_hash.h"
namespace open_vcdiff {
typedef unsigned long uword_t;
BlockHash::BlockHash(const char* source_data,
size_t source_size,
int starting_offset)
: source_data_(source_data),
source_size_(source_size),
hash_table_mask_(0),
starting_offset_(starting_offset),
last_block_added_(-1) {
}
BlockHash::~BlockHash() { }
VCD_COMPILE_ASSERT(BlockHash::kBlockSize >= 2, kBlockSize_must_be_at_least_2);
VCD_COMPILE_ASSERT((BlockHash::kBlockSize & (BlockHash::kBlockSize - 1)) == 0,
kBlockSize_must_be_a_power_of_2);
bool BlockHash::Init(bool populate_hash_table) {
if (!hash_table_.empty() ||
!next_block_table_.empty() ||
!last_block_table_.empty()) {
VCD_DFATAL << "Init() called twice for same BlockHash object" << VCD_ENDL;
return false;
}
const size_t table_size = CalcTableSize(source_size_);
if (table_size == 0) {
VCD_DFATAL << "Error finding table size for source size " << source_size_
<< VCD_ENDL;
return false;
}
hash_table_mask_ = static_cast<uint32_t>(table_size - 1);
hash_table_.resize(table_size, -1);
next_block_table_.resize(GetNumberOfBlocks(), -1);
last_block_table_.resize(GetNumberOfBlocks(), -1);
if (populate_hash_table) {
AddAllBlocks();
}
return true;
}
const BlockHash* BlockHash::CreateDictionaryHash(const char* dictionary_data,
size_t dictionary_size) {
BlockHash* new_dictionary_hash = new BlockHash(dictionary_data,
dictionary_size,
0);
if (!new_dictionary_hash->Init( true)) {
delete new_dictionary_hash;
return NULL;
} else {
return new_dictionary_hash;
}
}
BlockHash* BlockHash::CreateTargetHash(const char* target_data,
size_t target_size,
size_t dictionary_size) {
BlockHash* new_target_hash = new BlockHash(target_data,
target_size,
static_cast<int>(dictionary_size));
if (!new_target_hash->Init( false)) {
delete new_target_hash;
return NULL;
} else {
return new_target_hash;
}
}
size_t BlockHash::CalcTableSize(const size_t dictionary_size) {
const size_t min_size = (dictionary_size / sizeof(int)) + 1; size_t table_size = 1;
while (table_size < min_size) {
table_size <<= 1;
if (table_size <= 0) {
VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
<< dictionary_size
<< "): resulting table_size " << table_size
<< " is zero or negative" << VCD_ENDL;
return 0;
}
}
if ((table_size & (table_size - 1)) != 0) {
VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
<< dictionary_size
<< "): resulting table_size " << table_size
<< " is not a power of 2" << VCD_ENDL;
return 0;
}
if ((dictionary_size > 0) && (table_size > (min_size * 2))) {
VCD_DFATAL << "Internal error: CalcTableSize(dictionary_size = "
<< dictionary_size
<< "): resulting table_size " << table_size
<< " is too large" << VCD_ENDL;
return 0;
}
return table_size;
}
void BlockHash::AddBlock(uint32_t hash_value) {
if (hash_table_.empty()) {
VCD_DFATAL << "BlockHash::AddBlock() called before BlockHash::Init()"
<< VCD_ENDL;
return;
}
int block_number = last_block_added_ + 1;
const int total_blocks =
static_cast<int>(source_size_ / kBlockSize); if (block_number >= total_blocks) {
VCD_DFATAL << "BlockHash::AddBlock() called"
" with block number " << block_number
<< " that is past last block " << (total_blocks - 1)
<< VCD_ENDL;
return;
}
if (next_block_table_[block_number] != -1) {
VCD_DFATAL << "Internal error in BlockHash::AddBlock(): "
"block number = " << block_number
<< ", next block should be -1 but is "
<< next_block_table_[block_number] << VCD_ENDL;
return;
}
const uint32_t hash_table_index = GetHashTableIndex(hash_value);
const int first_matching_block = hash_table_[hash_table_index];
if (first_matching_block < 0) {
hash_table_[hash_table_index] = block_number;
last_block_table_[block_number] = block_number;
} else {
const int last_matching_block = last_block_table_[first_matching_block];
if (next_block_table_[last_matching_block] != -1) {
VCD_DFATAL << "Internal error in BlockHash::AddBlock(): "
"first matching block = " << first_matching_block
<< ", last matching block = " << last_matching_block
<< ", next block should be -1 but is "
<< next_block_table_[last_matching_block] << VCD_ENDL;
return;
}
next_block_table_[last_matching_block] = block_number;
last_block_table_[first_matching_block] = block_number;
}
last_block_added_ = block_number;
}
void BlockHash::AddAllBlocks() {
AddAllBlocksThroughIndex(static_cast<int>(source_size_));
}
void BlockHash::AddAllBlocksThroughIndex(int end_index) {
if (end_index > static_cast<int>(source_size_)) {
VCD_DFATAL << "BlockHash::AddAllBlocksThroughIndex() called"
" with index " << end_index
<< " higher than end index " << source_size_ << VCD_ENDL;
return;
}
const int last_index_added = last_block_added_ * kBlockSize;
if (end_index <= last_index_added) {
VCD_DFATAL << "BlockHash::AddAllBlocksThroughIndex() called"
" with index " << end_index
<< " <= last index added ( " << last_index_added
<< ")" << VCD_ENDL;
return;
}
if (source_size() < static_cast<size_t>(kBlockSize)) {
return;
}
int end_limit = end_index;
int last_legal_hash_index = static_cast<int>(source_size() - kBlockSize);
if (end_limit > last_legal_hash_index) {
end_limit = last_legal_hash_index + 1;
}
const char* block_ptr = source_data() + NextIndexToAdd();
const char* const end_ptr = source_data() + end_limit;
while (block_ptr < end_ptr) {
AddBlock(RollingHash<kBlockSize>::Hash(block_ptr));
block_ptr += kBlockSize;
}
}
VCD_COMPILE_ASSERT((BlockHash::kBlockSize % sizeof(uword_t)) == 0,
kBlockSize_must_be_a_multiple_of_machine_word_size);
template<int number_of_words>
inline bool CompareWholeWordValues(const char* block1,
const char* block2) {
return CompareWholeWordValues<1>(block1, block2) &&
CompareWholeWordValues<number_of_words - 1>(block1 + sizeof(uword_t),
block2 + sizeof(uword_t));
}
template<>
inline bool CompareWholeWordValues<1>(const char* word1,
const char* word2) {
uword_t aligned_word1, aligned_word2;
memcpy(&aligned_word1, word1, sizeof(aligned_word1));
memcpy(&aligned_word2, word2, sizeof(aligned_word2));
return aligned_word1 == aligned_word2;
}
inline bool BlockCompareWordsInline(const char* block1, const char* block2) {
static const size_t kWordsPerBlock = BlockHash::kBlockSize / sizeof(uword_t);
return CompareWholeWordValues<kWordsPerBlock>(block1, block2);
}
bool BlockHash::BlockCompareWords(const char* block1, const char* block2) {
return BlockCompareWordsInline(block1, block2);
}
inline bool BlockContentsMatchInline(const char* block1, const char* block2) {
if (*block1 != *block2) {
return false;
}
#ifdef VCDIFF_USE_BLOCK_COMPARE_WORDS
return BlockCompareWordsInline(block1, block2);
#else
return memcmp(block1, block2, BlockHash::kBlockSize) == 0;
#endif }
bool BlockHash::BlockContentsMatch(const char* block1, const char* block2) {
return BlockContentsMatchInline(block1, block2);
}
inline int BlockHash::SkipNonMatchingBlocks(int block_number,
const char* block_ptr) const {
int probes = 0;
while ((block_number >= 0) &&
!BlockContentsMatchInline(block_ptr,
&source_data_[block_number * kBlockSize])) {
if (++probes > kMaxProbes) {
return -1; }
block_number = next_block_table_[block_number];
}
return block_number;
}
inline int BlockHash::FirstMatchingBlockInline(uint32_t hash_value,
const char* block_ptr) const {
return SkipNonMatchingBlocks(hash_table_[GetHashTableIndex(hash_value)],
block_ptr);
}
int BlockHash::FirstMatchingBlock(uint32_t hash_value,
const char* block_ptr) const {
return FirstMatchingBlockInline(hash_value, block_ptr);
}
int BlockHash::NextMatchingBlock(int block_number,
const char* block_ptr) const {
if (static_cast<size_t>(block_number) >= GetNumberOfBlocks()) {
VCD_DFATAL << "NextMatchingBlock called for invalid block number "
<< block_number << VCD_ENDL;
return -1;
}
return SkipNonMatchingBlocks(next_block_table_[block_number], block_ptr);
}
inline bool BlockHash::TooManyMatches(int* match_counter) {
++(*match_counter);
return (*match_counter) > kMaxMatchesToCheck;
}
int BlockHash::MatchingBytesToLeft(const char* source_match_start,
const char* target_match_start,
int max_bytes) {
const char* source_ptr = source_match_start;
const char* target_ptr = target_match_start;
int bytes_found = 0;
while (bytes_found < max_bytes) {
--source_ptr;
--target_ptr;
if (*source_ptr != *target_ptr) {
break;
}
++bytes_found;
}
return bytes_found;
}
int BlockHash::MatchingBytesToRight(const char* source_match_end,
const char* target_match_end,
int max_bytes) {
const char* source_ptr = source_match_end;
const char* target_ptr = target_match_end;
int bytes_found = 0;
while ((bytes_found < max_bytes) && (*source_ptr == *target_ptr)) {
++bytes_found;
++source_ptr;
++target_ptr;
}
return bytes_found;
}
void BlockHash::FindBestMatch(uint32_t hash_value,
const char* target_candidate_start,
const char* target_start,
size_t target_size,
Match* best_match) const {
int match_counter = 0;
for (int block_number = FirstMatchingBlockInline(hash_value,
target_candidate_start);
(block_number >= 0) && !TooManyMatches(&match_counter);
block_number = NextMatchingBlock(block_number, target_candidate_start)) {
int source_match_offset = block_number * kBlockSize;
const int source_match_end = source_match_offset + kBlockSize;
int target_match_offset =
static_cast<int>(target_candidate_start - target_start);
const int target_match_end = target_match_offset + kBlockSize;
size_t match_size = kBlockSize;
{
const int limit_bytes_to_left = std::min(source_match_offset,
target_match_offset);
const int matching_bytes_to_left =
MatchingBytesToLeft(source_data_ + source_match_offset,
target_start + target_match_offset,
limit_bytes_to_left);
source_match_offset -= matching_bytes_to_left;
target_match_offset -= matching_bytes_to_left;
match_size += matching_bytes_to_left;
}
{
const size_t source_bytes_to_right = source_size_ - source_match_end;
const size_t target_bytes_to_right = target_size - target_match_end;
const size_t limit_bytes_to_right = std::min(source_bytes_to_right,
target_bytes_to_right);
match_size +=
MatchingBytesToRight(source_data_ + source_match_end,
target_start + target_match_end,
static_cast<int>(limit_bytes_to_right));
}
best_match->ReplaceIfBetterMatch(match_size,
source_match_offset + starting_offset_,
target_match_offset);
}
}
}