#include <inttypes.h>
#include <stdio.h>
#include <sys/param.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdint.h>
#include <endian.h>
#include "out.h"
#include "uuid.h"
#include "btt.h"
#include "btt_layout.h"
#include "sys_util.h"
#include "util.h"
/*
 * struct btt -- run-time state of one BTT namespace
 *
 * Holds the namespace-wide geometry, the state of the lazily written
 * layout, and an array with the run-time state of every arena.
 */
struct btt {
/* number of lanes; btt_init() sets it to nfree, capped by maxlane */
unsigned nlane;
/* taken around the write_layout() call made by the first write */
os_mutex_t layout_write_mutex;
/* set to 1 by read_arenas() once the arenas have been loaded */
int laidout;
/* UUID of this BTT; generated by write_layout() when it writes the layout */
uint8_t uuid[BTTINFO_UUID_LEN];
/* UUID given to btt_init(); read_info() requires info blocks to match it */
uint8_t parent_uuid[BTTINFO_UUID_LEN];
/* raw size of the namespace, as passed to btt_init() */
uint64_t rawsize;
/* external block size, as passed to btt_init() */
uint32_t lbasize;
/* number of flog pairs, rtt slots and map locks kept per arena */
uint32_t nfree;
/* total number of external LBAs, summed over all arenas */
uint64_t nlba;
/* number of arenas in the namespace */
unsigned narena;
/* per-arena run-time state; the array is allocated by read_arenas() */
struct arena {
/* info block flags in host order; arena_setf() ORs bits in at run time */
uint32_t flags;
/* geometry copied (in host order) from the arena's info block */
uint32_t external_nlba;
uint32_t internal_lbasize;
uint32_t internal_nlba;
/*
 * Offsets set by read_arena(): startoff is the arena offset itself, the
 * others are the arena offset plus the value stored in the info block.
 */
uint64_t startoff;
uint64_t dataoff;
uint64_t mapoff;
uint64_t flogoff;
uint64_t nextoff;
/* run-time state of one flog pair; read_flogs() allocates nfree of them */
struct flog_runtime {
/* current flog entry, in host byte order */
struct btt_flog flog;
/* offsets of the two on-media slots of the pair */
uint64_t entries[2];
/* index into entries[] of the slot flog_update() writes next */
int next;
} *flogs;
/*
 * Read tracking table, one slot per lane.  btt_read() stores the map
 * entry it is reading through and resets the slot to
 * BTT_MAP_ENTRY_ERROR afterwards; btt_write() waits on these slots.
 */
uint32_t volatile *rtt;
/* nfree mutexes; get_map_lock_num() picks the one for a pre-map LBA */
os_mutex_t *map_locks;
/* serializes the info block read-modify-write done by arena_setf() */
os_mutex_t info_lock;
} *arenas;
/* opaque namespace handle handed back to every ns_cbp callback */
void *ns;
/* callbacks used for all namespace I/O (nsread, nswrite, nszero, nsmap) */
const struct ns_callback *ns_cbp;
};
/* signature that read_info() expects and write_layout() stores in info blocks */
static const char Sig[] = BTTINFO_SIG;
/* all-zero flog entry; write_layout() writes it as the second slot of a pair */
static const struct btt_flog Zflog;
/*
 * Successor table for the two-bit flog sequence number:
 * 1 -> 2 -> 3 -> 1, while 0 maps to 0.
 */
static const unsigned Nseq[] = { 0, 2, 3, 1 };
/* next sequence number after seq (only the low two bits of seq are used) */
#define NSEQ(seq) (Nseq[(seq) & 3])
/*
 * get_map_lock_num -- pick the map lock that covers a pre-map LBA
 *
 * The byte position of the map entry is divided by BTT_MAP_LOCK_ALIGN and
 * the result is folded into the range [0, nfree).
 */
static inline uint32_t
get_map_lock_num(uint32_t premap_lba, uint32_t nfree)
{
	uint32_t aligned_slot =
		(uint32_t)(premap_lba * BTT_MAP_ENTRY_SIZE / BTT_MAP_LOCK_ALIGN);

	return aligned_slot % nfree;
}
/*
 * invalid_lba -- check an LBA against the size of the namespace
 *
 * Returns 0 when lba is below bttp->nlba.  Otherwise logs an error,
 * sets errno to EINVAL and returns 1.
 */
static int
invalid_lba(struct btt *bttp, uint64_t lba)
{
	LOG(3, "bttp %p lba %" PRIu64, bttp, lba);

	if (lba < bttp->nlba)
		return 0;

	ERR("lba out of range (nlba %" PRIu64 ")", bttp->nlba);
	errno = EINVAL;
	return 1;
}
/*
 * read_info -- validate an info block and convert it to host byte order
 *
 * Returns 1 when the block carries the expected signature, the parent
 * UUID of this namespace, a correct checksum and a non-zero major version;
 * the fields converted below are then in host order.  Returns 0 otherwise.
 */
static int
read_info(struct btt *bttp, struct btt_info *infop)
{
	LOG(3, "infop %p", infop);

	if (memcmp(infop->sig, Sig, BTTINFO_SIG_LEN) != 0) {
		LOG(3, "signature invalid");
		return 0;
	}

	if (memcmp(infop->parent_uuid, bttp->parent_uuid,
			BTTINFO_UUID_LEN) != 0) {
		LOG(3, "parent UUID mismatch");
		return 0;
	}

	/* the checksum is verified before any field is converted */
	if (util_checksum(infop, sizeof(*infop), &infop->checksum, 0) == 0) {
		LOG(3, "invalid checksum");
		return 0;
	}

	/* the major version is converted first because it is validated */
	infop->major = le16toh(infop->major);
	if (infop->major == 0) {
		LOG(3, "invalid major version (0)");
		return 0;
	}
	infop->minor = le16toh(infop->minor);

	/* 32-bit fields */
	infop->flags = le32toh(infop->flags);
	infop->external_lbasize = le32toh(infop->external_lbasize);
	infop->external_nlba = le32toh(infop->external_nlba);
	infop->internal_lbasize = le32toh(infop->internal_lbasize);
	infop->internal_nlba = le32toh(infop->internal_nlba);
	infop->nfree = le32toh(infop->nfree);
	infop->infosize = le32toh(infop->infosize);

	/* 64-bit offsets */
	infop->nextoff = le64toh(infop->nextoff);
	infop->dataoff = le64toh(infop->dataoff);
	infop->mapoff = le64toh(infop->mapoff);
	infop->flogoff = le64toh(infop->flogoff);
	infop->infooff = le64toh(infop->infooff);

	return 1;
}
/*
 * map_entry_is_zero -- true when the flag bits of a map entry (everything
 * outside BTT_MAP_ENTRY_LBA_MASK) equal BTT_MAP_ENTRY_ZERO
 */
static inline int
map_entry_is_zero(uint32_t map_entry)
{
	uint32_t flag_bits = map_entry & ~BTT_MAP_ENTRY_LBA_MASK;

	return flag_bits == BTT_MAP_ENTRY_ZERO;
}
/*
 * map_entry_is_error -- true when the flag bits of a map entry (everything
 * outside BTT_MAP_ENTRY_LBA_MASK) equal BTT_MAP_ENTRY_ERROR
 */
static inline int
map_entry_is_error(uint32_t map_entry)
{
	uint32_t flag_bits = map_entry & ~BTT_MAP_ENTRY_LBA_MASK;

	return flag_bits == BTT_MAP_ENTRY_ERROR;
}
/*
 * map_entry_is_initial -- true when no flag bit is set in a map entry,
 * i.e. every bit outside BTT_MAP_ENTRY_LBA_MASK is clear
 */
int
map_entry_is_initial(uint32_t map_entry)
{
	uint32_t flag_bits = map_entry & ~BTT_MAP_ENTRY_LBA_MASK;

	return flag_bits == 0;
}
/*
 * map_entry_is_zero_or_initial -- true when the flag bits of a map entry
 * are either all clear or equal to BTT_MAP_ENTRY_ZERO
 */
static inline int
map_entry_is_zero_or_initial(uint32_t map_entry)
{
	uint32_t flag_bits = map_entry & ~BTT_MAP_ENTRY_LBA_MASK;

	if (flag_bits == 0)
		return 1;

	return flag_bits == BTT_MAP_ENTRY_ZERO;
}
/*
 * btt_flog_get_valid -- pick the current entry of a flog pair
 *
 * Returns NULL when both sequence numbers are equal.  Otherwise returns
 * the current entry and stores in *next the index of the other slot,
 * which is the one to be written next.  The second slot is current when
 * the first one has sequence number 0 or when the second one carries the
 * successor of the first one's sequence number; in every other case the
 * first slot is current.
 */
struct btt_flog *
btt_flog_get_valid(struct btt_flog *flog_pair, int *next)
{
	struct btt_flog *first = &flog_pair[0];
	struct btt_flog *second = &flog_pair[1];

	if (first->seq == second->seq)
		return NULL;

	int second_is_current = (first->seq == 0) ||
		(second->seq != 0 && NSEQ(first->seq) == second->seq);

	if (second_is_current) {
		*next = 0;
		return second;
	}

	*next = 1;
	return first;
}
/*
 * read_flog_pair -- load one flog pair and finish an interrupted update
 *
 * Reads both slots of the pair at flog_off, converts them to host order,
 * selects the current one and stores it in *flog_runtimep.  When the
 * current entry records a change that the map does not yet reflect, the
 * map entry is rewritten with new_map.
 *
 * Returns 0 on success, and also when the pair has equal sequence numbers
 * (in which case BTTINFO_FLAG_ERROR is set in arenap->flags).  Returns -1
 * on a bad lane or offset, an out-of-range flog LBA, or an I/O failure.
 */
static int
read_flog_pair(struct btt *bttp, unsigned lane, struct arena *arenap,
uint64_t flog_off, struct flog_runtime *flog_runtimep, uint32_t flognum)
{
LOG(5, "bttp %p lane %u arenap %p flog_off %" PRIu64 " runtimep %p "
"flognum %u", bttp, lane, arenap, flog_off, flog_runtimep,
flognum);
/* remember where both slots of the pair live for flog_update() */
flog_runtimep->entries[0] = flog_off;
flog_runtimep->entries[1] = flog_off + sizeof(struct btt_flog);
if (lane >= bttp->nfree) {
ERR("invalid lane %u among nfree %d", lane, bttp->nfree);
errno = EINVAL;
return -1;
}
if (flog_off == 0) {
ERR("invalid flog offset %" PRIu64, flog_off);
errno = EINVAL;
return -1;
}
/* read both slots with a single call */
struct btt_flog flog_pair[2];
if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, flog_pair,
sizeof(flog_pair), flog_off) < 0)
return -1;
/* convert each slot to host order and validate its LBA */
btt_flog_convert2h(&flog_pair[0]);
if (invalid_lba(bttp, flog_pair[0].lba))
return -1;
btt_flog_convert2h(&flog_pair[1]);
if (invalid_lba(bttp, flog_pair[1].lba))
return -1;
LOG(6, "flog_pair[0] flog_off %" PRIu64 " old_map %u new_map %u seq %u",
flog_off, flog_pair[0].old_map,
flog_pair[0].new_map, flog_pair[0].seq);
LOG(6, "flog_pair[1] old_map %u new_map %u seq %u",
flog_pair[1].old_map, flog_pair[1].new_map,
flog_pair[1].seq);
/* select the current slot; this also sets the slot to write next */
struct btt_flog *currentp = btt_flog_get_valid(flog_pair,
&flog_runtimep->next);
if (currentp == NULL) {
/* only the run-time flags are marked here; the caller still sees success */
ERR("flog layout error: bad seq numbers %d %d",
flog_pair[0].seq, flog_pair[1].seq);
arenap->flags |= BTTINFO_FLAG_ERROR;
return 0;
}
LOG(6, "run-time flog next is %d", flog_runtimep->next);
flog_runtimep->flog = *currentp;
LOG(9, "read flog[%u]: lba %u old %u%s%s%s new %u%s%s%s", flognum,
currentp->lba,
currentp->old_map & BTT_MAP_ENTRY_LBA_MASK,
(map_entry_is_error(currentp->old_map)) ? " ERROR" : "",
(map_entry_is_zero(currentp->old_map)) ? " ZERO" : "",
(map_entry_is_initial(currentp->old_map)) ? " INIT" : "",
currentp->new_map & BTT_MAP_ENTRY_LBA_MASK,
(map_entry_is_error(currentp->new_map)) ? " ERROR" : "",
(map_entry_is_zero(currentp->new_map)) ? " ZERO" : "",
(map_entry_is_initial(currentp->new_map)) ? " INIT" : "");
/* old_map == new_map is how write_layout() initializes an entry */
if (currentp->old_map == currentp->new_map) {
LOG(9, "flog[%u] entry complete (initial state)", flognum);
return 0;
}
/* locate and read the map entry the flog entry refers to */
uint64_t map_entry_off = arenap->mapoff +
BTT_MAP_ENTRY_SIZE * currentp->lba;
uint32_t entry;
if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &entry,
sizeof(entry), map_entry_off) < 0)
return -1;
entry = le32toh(entry);
/* an entry without flag bits is treated as the identity mapping */
if (map_entry_is_initial(entry))
entry = currentp->lba | BTT_MAP_ENTRY_NORMAL;
/* the map still holds old_map: complete the update by storing new_map */
if (currentp->new_map != entry && currentp->old_map == entry) {
LOG(9, "recover flog[%u]: map[%u]: %u",
flognum, currentp->lba, currentp->new_map);
entry = htole32(currentp->new_map);
if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &entry,
sizeof(uint32_t), map_entry_off) < 0)
return -1;
}
return 0;
}
/*
 * flog_update -- write a new flog entry for a lane and update run-time state
 *
 * The entry is written into the slot selected by flogs[lane].next using
 * two separate nswrite calls, after which the run-time copy is updated
 * and next is flipped to the other slot.
 *
 * Returns 0 on success, -1 when either write fails (the run-time state is
 * then left untouched).
 */
static int
flog_update(struct btt *bttp, unsigned lane, struct arena *arenap,
uint32_t lba, uint32_t old_map, uint32_t new_map)
{
LOG(3, "bttp %p lane %u arenap %p lba %u old_map %u new_map %u",
bttp, lane, arenap, lba, old_map, new_map);
/* build the entry in host order, then convert it to little-endian */
struct btt_flog new_flog;
new_flog.lba = lba;
new_flog.old_map = old_map;
new_flog.new_map = new_map;
new_flog.seq = NSEQ(arenap->flogs[lane].flog.seq);
btt_flog_convert2le(&new_flog);
uint64_t new_flog_off =
arenap->flogs[lane].entries[arenap->flogs[lane].next];
/* first write: the leading 2 * sizeof(uint32_t) bytes of the entry */
if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &new_flog,
sizeof(uint32_t) * 2, new_flog_off) < 0)
return -1;
new_flog_off += sizeof(uint32_t) * 2;
/*
 * Second write: the same number of bytes starting at new_map.
 * NOTE(review): presumably this covers new_map and seq, so that seq
 * becomes visible last -- confirm against struct btt_flog's layout.
 */
if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &new_flog.new_map,
sizeof(uint32_t) * 2, new_flog_off) < 0)
return -1;
/* both writes succeeded: mirror the new entry in the run-time state */
arenap->flogs[lane].next = 1 - arenap->flogs[lane].next;
arenap->flogs[lane].flog.lba = lba;
arenap->flogs[lane].flog.old_map = old_map;
arenap->flogs[lane].flog.new_map = new_map;
arenap->flogs[lane].flog.seq = NSEQ(arenap->flogs[lane].flog.seq);
LOG(9, "update flog[%u]: lba %u old %u%s%s%s new %u%s%s%s", lane, lba,
old_map & BTT_MAP_ENTRY_LBA_MASK,
(map_entry_is_error(old_map)) ? " ERROR" : "",
(map_entry_is_zero(old_map)) ? " ZERO" : "",
(map_entry_is_initial(old_map)) ? " INIT" : "",
new_map & BTT_MAP_ENTRY_LBA_MASK,
(map_entry_is_error(new_map)) ? " ERROR" : "",
(map_entry_is_zero(new_map)) ? " ZERO" : "",
(map_entry_is_initial(new_map)) ? " INIT" : "");
return 0;
}
/*
 * arena_setf -- set flag bits for an arena, at run time and on media
 *
 * The bits in setf are ORed into arenap->flags.  When a layout exists,
 * the info block at the start of the arena is read, updated, checksummed
 * again and written both to the start of the arena and to the offset
 * given by its infooff field.
 *
 * Returns 0 on success (including the no-layout case), -1 on I/O failure.
 */
static int
arena_setf(struct btt *bttp, struct arena *arenap, unsigned lane, uint32_t setf)
{
LOG(3, "bttp %p arenap %p lane %u setf 0x%x", bttp, arenap, lane, setf);
/* the run-time flags are always updated, even without a layout */
util_fetch_and_or32(&arenap->flags, setf);
if (!bttp->laidout) {
return 0;
}
uint64_t arena_off = arenap->startoff;
struct btt_info info;
/* info_lock covers the whole read-modify-write of the info block */
util_mutex_lock(&arenap->info_lock);
if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info,
sizeof(info), arena_off) < 0) {
goto err;
}
/* the block stays little-endian; only infooff is needed in host order */
uint64_t infooff = le64toh(info.infooff);
info.flags |= htole32(setf);
util_checksum(&info, sizeof(info), &info.checksum, 1);
/* write the primary copy, then the copy at infooff */
if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
sizeof(info), arena_off) < 0) {
goto err;
}
if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
sizeof(info), arena_off + infooff) < 0) {
goto err;
}
util_mutex_unlock(&arenap->info_lock);
return 0;
err:
util_mutex_unlock(&arenap->info_lock);
return -1;
}
/*
 * set_arena_error -- flag an arena as being in the error state
 *
 * Thin wrapper that passes BTTINFO_FLAG_ERROR to arena_setf() and hands
 * back its result.
 */
static int
set_arena_error(struct btt *bttp, struct arena *arenap, unsigned lane)
{
	LOG(3, "bttp %p arena %p lane %u", bttp, arenap, lane);

	int status = arena_setf(bttp, arenap, lane, BTTINFO_FLAG_ERROR);

	return status;
}
/*
 * read_flogs -- allocate and load the run-time flog state of an arena
 *
 * Allocates bttp->nfree flog_runtime entries and fills each one through
 * read_flog_pair(), walking the on-media pairs from arenap->flogoff.
 * When a pair cannot be read the arena is flagged with an error.
 *
 * Returns 0 on success, -1 on allocation or read failure.
 */
static int
read_flogs(struct btt *bttp, unsigned lane, struct arena *arenap)
{
	arenap->flogs = Zalloc(bttp->nfree * sizeof(struct flog_runtime));
	if (arenap->flogs == NULL) {
		ERR("!Malloc for %u flog entries", bttp->nfree);
		return -1;
	}

	uint64_t pair_off = arenap->flogoff;
	for (uint32_t pair = 0; pair < bttp->nfree; pair++) {
		struct flog_runtime *runtimep = &arenap->flogs[pair];

		if (read_flog_pair(bttp, lane, arenap, pair_off,
				runtimep, pair) < 0) {
			set_arena_error(bttp, arenap, lane);
			return -1;
		}

		/* each pair occupies an aligned slot on the media */
		pair_off += roundup(2 * sizeof(struct btt_flog),
				BTT_FLOG_PAIR_ALIGN);
	}

	return 0;
}
/*
 * build_rtt -- allocate the read tracking table of an arena
 *
 * Every one of the bttp->nfree slots starts out as BTT_MAP_ENTRY_ERROR.
 * Returns 0 on success, -1 when the allocation fails.
 */
static int
build_rtt(struct btt *bttp, struct arena *arenap)
{
	arenap->rtt = Malloc(bttp->nfree * sizeof(uint32_t));
	if (arenap->rtt == NULL) {
		ERR("!Malloc for %d rtt entries", bttp->nfree);
		return -1;
	}

	uint32_t slot = 0;
	while (slot < bttp->nfree) {
		arenap->rtt[slot] = BTT_MAP_ENTRY_ERROR;
		slot++;
	}

	util_synchronize();

	return 0;
}
/*
 * build_map_locks -- allocate and initialize the map locks of an arena
 *
 * One mutex is created for each of the bttp->nfree lock slots.
 * Returns 0 on success, -1 when the allocation fails.
 */
static int
build_map_locks(struct btt *bttp, struct arena *arenap)
{
	arenap->map_locks = Malloc(bttp->nfree * sizeof(*arenap->map_locks));
	if (arenap->map_locks == NULL) {
		ERR("!Malloc for %d map_lock entries", bttp->nfree);
		return -1;
	}

	uint32_t slot = 0;
	while (slot < bttp->nfree) {
		util_mutex_init(&arenap->map_locks[slot]);
		slot++;
	}

	return 0;
}
/*
 * read_arena -- load the run-time state of the arena at arena_off
 *
 * Reads the info block found at arena_off, copies its geometry into
 * *arenap (turning the stored offsets into offsets from the start of the
 * namespace), then loads the flogs and builds the read tracking table
 * and the map locks.
 *
 * Returns 0 on success, -1 on any I/O or allocation failure.
 */
static int
read_arena(struct btt *bttp, unsigned lane, uint64_t arena_off,
	struct arena *arenap)
{
	LOG(3, "bttp %p lane %u arena_off %" PRIu64 " arenap %p",
		bttp, lane, arena_off, arenap);

	struct btt_info info;
	int rc = (*bttp->ns_cbp->nsread)(bttp->ns, lane, &info, sizeof(info),
			arena_off);
	if (rc < 0)
		return -1;

	/* offsets stored in the info block are relative to the arena */
	arenap->startoff = arena_off;
	arenap->dataoff = arena_off + le64toh(info.dataoff);
	arenap->mapoff = arena_off + le64toh(info.mapoff);
	arenap->flogoff = arena_off + le64toh(info.flogoff);
	arenap->nextoff = arena_off + le64toh(info.nextoff);

	arenap->flags = le32toh(info.flags);
	arenap->external_nlba = le32toh(info.external_nlba);
	arenap->internal_lbasize = le32toh(info.internal_lbasize);
	arenap->internal_nlba = le32toh(info.internal_nlba);

	/* stop at the first step that fails */
	if (read_flogs(bttp, lane, arenap) < 0 ||
	    build_rtt(bttp, arenap) < 0 ||
	    build_map_locks(bttp, arenap) < 0)
		return -1;

	util_mutex_init(&arenap->info_lock);

	return 0;
}
/*
 * btt_info_convert2h -- convert the integer fields of an info block from
 * little-endian to host byte order, in place
 */
void
btt_info_convert2h(struct btt_info *infop)
{
	/* 16-bit version fields */
	infop->major = le16toh(infop->major);
	infop->minor = le16toh(infop->minor);

	/* 32-bit fields */
	infop->flags = le32toh(infop->flags);
	infop->external_lbasize = le32toh(infop->external_lbasize);
	infop->external_nlba = le32toh(infop->external_nlba);
	infop->internal_lbasize = le32toh(infop->internal_lbasize);
	infop->internal_nlba = le32toh(infop->internal_nlba);
	infop->nfree = le32toh(infop->nfree);
	infop->infosize = le32toh(infop->infosize);

	/* 64-bit offsets */
	infop->nextoff = le64toh(infop->nextoff);
	infop->dataoff = le64toh(infop->dataoff);
	infop->mapoff = le64toh(infop->mapoff);
	infop->flogoff = le64toh(infop->flogoff);
	infop->infooff = le64toh(infop->infooff);
}
/*
 * btt_info_convert2le -- convert the integer fields of an info block from
 * host to little-endian byte order, in place
 *
 * Uses the host-to-little-endian helpers, matching btt_flog_convert2le(),
 * so the direction of the conversion is explicit in the code.
 */
void
btt_info_convert2le(struct btt_info *infop)
{
	infop->flags = htole32(infop->flags);
	infop->major = htole16(infop->major);
	infop->minor = htole16(infop->minor);
	infop->external_lbasize = htole32(infop->external_lbasize);
	infop->external_nlba = htole32(infop->external_nlba);
	infop->internal_lbasize = htole32(infop->internal_lbasize);
	infop->internal_nlba = htole32(infop->internal_nlba);
	infop->nfree = htole32(infop->nfree);
	infop->infosize = htole32(infop->infosize);
	infop->nextoff = htole64(infop->nextoff);
	infop->dataoff = htole64(infop->dataoff);
	infop->mapoff = htole64(infop->mapoff);
	infop->flogoff = htole64(infop->flogoff);
	infop->infooff = htole64(infop->infooff);
}
/*
 * btt_flog_convert2h -- convert all four fields of a flog entry from
 * little-endian to host byte order, in place
 */
void
btt_flog_convert2h(struct btt_flog *flogp)
{
	/* the map pair */
	flogp->old_map = le32toh(flogp->old_map);
	flogp->new_map = le32toh(flogp->new_map);

	/* the block number and the sequence number */
	flogp->lba = le32toh(flogp->lba);
	flogp->seq = le32toh(flogp->seq);
}
/*
 * btt_flog_convert2le -- convert all four fields of a flog entry from
 * host to little-endian byte order, in place
 */
void
btt_flog_convert2le(struct btt_flog *flogp)
{
	/* the map pair */
	flogp->old_map = htole32(flogp->old_map);
	flogp->new_map = htole32(flogp->new_map);

	/* the block number and the sequence number */
	flogp->lba = htole32(flogp->lba);
	flogp->seq = htole32(flogp->seq);
}
/*
 * read_arenas -- load the run-time state of all arenas
 *
 * Allocates a zeroed array of narena arena structures and fills them in
 * by following the nextoff chain starting at offset 0.  On success the
 * namespace is marked as laid out.
 *
 * On failure everything allocated so far is released, bttp->arenas is
 * reset to NULL, errno is preserved and -1 is returned.  The clean up
 * walks exactly the narena entries that were allocated here rather than
 * relying on bttp->narena holding the same value.
 */
static int
read_arenas(struct btt *bttp, unsigned lane, unsigned narena)
{
	LOG(3, "bttp %p lane %u narena %d", bttp, lane, narena);

	if ((bttp->arenas = Zalloc(narena * sizeof(*bttp->arenas))) == NULL) {
		ERR("!Malloc for %u arenas", narena);
		goto err;
	}

	uint64_t arena_off = 0;
	struct arena *arenap = bttp->arenas;
	for (unsigned i = 0; i < narena; i++) {
		if (read_arena(bttp, lane, arena_off, arenap) < 0)
			goto err;

		/* read_arena() stored the offset of the following arena */
		arena_off = arenap->nextoff;
		arenap++;
	}

	bttp->laidout = 1;

	return 0;

err:
	LOG(4, "error clean up");
	int oerrno = errno;
	if (bttp->arenas) {
		/* the array was zeroed, so untouched arenas hold NULL pointers */
		for (unsigned i = 0; i < narena; i++) {
			if (bttp->arenas[i].flogs)
				Free(bttp->arenas[i].flogs);
			if (bttp->arenas[i].rtt)
				Free((void *)bttp->arenas[i].rtt);
			if (bttp->arenas[i].map_locks)
				Free((void *)bttp->arenas[i].map_locks);
		}
		Free(bttp->arenas);
		bttp->arenas = NULL;
	}
	errno = oerrno;
	return -1;
}
/*
 * internal_lbasize -- compute the internal block size for an external one
 *
 * The external size is raised to at least BTT_MIN_LBA_SIZE and rounded up
 * to BTT_INTERNAL_LBA_ALIGNMENT.  A result smaller than the alignment
 * means the rounding wrapped around; errno is then set to EINVAL and 0 is
 * returned.
 */
static inline uint32_t
internal_lbasize(uint32_t external_lbasize)
{
	uint32_t padded = external_lbasize;

	if (padded < BTT_MIN_LBA_SIZE)
		padded = BTT_MIN_LBA_SIZE;

	uint32_t aligned = roundup(padded, BTT_INTERNAL_LBA_ALIGNMENT);

	if (aligned >= BTT_INTERNAL_LBA_ALIGNMENT)
		return aligned;

	errno = EINVAL;
	ERR("!Invalid lba size after alignment: %u ", aligned);
	return 0;
}
/*
 * btt_flog_size -- size of the flog area holding nfree flog pairs
 *
 * Each pair is rounded up to BTT_FLOG_PAIR_ALIGN and the total is rounded
 * up to BTT_ALIGNMENT.
 */
uint64_t
btt_flog_size(uint32_t nfree)
{
	size_t pair_bytes = roundup(2 * sizeof(struct btt_flog),
			BTT_FLOG_PAIR_ALIGN);
	uint64_t flog_bytes = nfree * pair_bytes;

	return roundup(flog_bytes, BTT_ALIGNMENT);
}
/*
 * btt_map_size -- size of a map with external_nlba entries, rounded up
 * to BTT_ALIGNMENT
 */
uint64_t
btt_map_size(uint32_t external_nlba)
{
	uint64_t map_bytes =
		roundup(external_nlba * BTT_MAP_ENTRY_SIZE, BTT_ALIGNMENT);

	return map_bytes;
}
/*
 * btt_arena_datasize -- what is left of an arena after subtracting the
 * two info blocks and the flog area
 */
uint64_t
btt_arena_datasize(uint64_t arena_size, uint32_t nfree)
{
	uint64_t info_bytes = 2 * sizeof(struct btt_info);
	uint64_t flog_bytes = btt_flog_size(nfree);

	return arena_size - (info_bytes + flog_bytes);
}
/*
 * btt_info_set_params -- fill in the size-related fields of an info block
 *
 * Stores the block sizes, nfree and infosize, then derives the number of
 * internal blocks that fit into the arena (one BTT_ALIGNMENT is held back
 * before dividing by the block size plus one map entry) and the number
 * of external blocks, which is nfree less.
 *
 * Returns 0 on success.  When fewer than 2 * nfree internal blocks fit,
 * errno is set to EINVAL and -1 is returned.
 */
static int
btt_info_set_params(struct btt_info *info, uint32_t external_lbasize,
	uint32_t internal_lbasize, uint32_t nfree, uint64_t arena_size)
{
	info->infosize = sizeof(*info);
	info->nfree = nfree;
	info->external_lbasize = external_lbasize;
	info->internal_lbasize = internal_lbasize;

	uint64_t data_bytes = btt_arena_datasize(arena_size, nfree);
	uint64_t usable_bytes = data_bytes - BTT_ALIGNMENT;
	uint64_t nblocks = usable_bytes /
		(info->internal_lbasize + BTT_MAP_ENTRY_SIZE);

	if (nblocks < 2 * nfree) {
		errno = EINVAL;
		ERR("!number of internal blocks: %" PRIu64
			" expected at least %u",
			nblocks, 2 * nfree);
		return -1;
	}

	ASSERT(nblocks <= UINT32_MAX);
	uint32_t nblocks_u32 = (uint32_t)nblocks;

	info->internal_nlba = nblocks_u32;
	info->external_nlba = nblocks_u32 - info->nfree;

	/* the data area minus the map must still hold every internal block */
	ASSERT((data_bytes - btt_map_size(info->external_nlba)) /
		internal_lbasize >= nblocks);

	return 0;
}
/*
 * btt_info_set_offs -- fill in the offset fields of an info block
 *
 * The data area starts right after the first info block.  The second
 * info block sits at the very end of the arena, preceded by the flog
 * area, which in turn is preceded by the map.  nextoff is the arena size
 * when at least BTT_MIN_SIZE is left after this arena, and 0 otherwise.
 */
static void
btt_info_set_offs(struct btt_info *info, uint64_t arena_size,
	uint64_t space_left)
{
	info->nextoff = (space_left >= BTT_MIN_SIZE) ? arena_size : 0;

	info->dataoff = info->infosize;

	/* lay the trailing metadata out backwards from the end of the arena */
	info->infooff = arena_size - sizeof(struct btt_info);
	info->flogoff = info->infooff - btt_flog_size(info->nfree);
	info->mapoff = info->flogoff - btt_map_size(info->external_nlba);

	ASSERTeq(btt_arena_datasize(arena_size, info->nfree) -
		btt_map_size(info->external_nlba), info->mapoff -
		info->dataoff);
}
/*
 * btt_info_set -- fill in the size and offset fields of an info block
 *
 * Returns 0 on success, -1 when the internal block size cannot be
 * computed or btt_info_set_params() rejects the parameters.
 */
int
btt_info_set(struct btt_info *info, uint32_t external_lbasize,
	uint32_t nfree, uint64_t arena_size, uint64_t space_left)
{
	uint32_t internal_lba_size = internal_lbasize(external_lbasize);

	/* the parameters are only computed for a usable block size */
	if (internal_lba_size == 0 ||
	    btt_info_set_params(info, external_lbasize,
			internal_lba_size, nfree, arena_size))
		return -1;

	btt_info_set_offs(info, arena_size, space_left);

	return 0;
}
static int
write_layout(struct btt *bttp, unsigned lane, int write)
{
LOG(3, "bttp %p lane %u write %d", bttp, lane, write);
ASSERT(bttp->rawsize >= BTT_MIN_SIZE);
ASSERT(bttp->nfree);
if (write) {
int ret = util_uuid_generate(bttp->uuid);
if (ret < 0) {
LOG(2, "util_uuid_generate failed");
return -1;
}
}
bttp->narena = (unsigned)(bttp->rawsize / BTT_MAX_ARENA);
if (bttp->rawsize % BTT_MAX_ARENA >= BTT_MIN_SIZE)
bttp->narena++;
LOG(4, "narena %u", bttp->narena);
uint32_t internal_lba_size = internal_lbasize(bttp->lbasize);
if (internal_lba_size == 0)
return -1;
LOG(4, "adjusted internal_lbasize %u", internal_lba_size);
uint64_t total_nlba = 0;
uint64_t rawsize = bttp->rawsize;
unsigned arena_num = 0;
uint64_t arena_off = 0;
while (rawsize >= BTT_MIN_SIZE) {
LOG(4, "layout arena %u", arena_num);
uint64_t arena_rawsize = rawsize;
if (arena_rawsize > BTT_MAX_ARENA) {
arena_rawsize = BTT_MAX_ARENA;
}
rawsize -= arena_rawsize;
arena_num++;
struct btt_info info;
memset(&info, '\0', sizeof(info));
if (btt_info_set_params(&info, bttp->lbasize,
internal_lba_size, bttp->nfree, arena_rawsize))
return -1;
LOG(4, "internal_nlba %u external_nlba %u",
info.internal_nlba, info.external_nlba);
total_nlba += info.external_nlba;
if (!write)
continue;
btt_info_set_offs(&info, arena_rawsize, rawsize);
LOG(4, "nextoff 0x%016" PRIx64, info.nextoff);
LOG(4, "dataoff 0x%016" PRIx64, info.dataoff);
LOG(4, "mapoff 0x%016" PRIx64, info.mapoff);
LOG(4, "flogoff 0x%016" PRIx64, info.flogoff);
LOG(4, "infooff 0x%016" PRIx64, info.infooff);
if (!bttp->ns_cbp->ns_is_zeroed) {
uint64_t mapsize = btt_map_size(info.external_nlba);
if ((*bttp->ns_cbp->nszero)(bttp->ns, lane, mapsize,
info.mapoff) < 0)
return -1;
}
uint64_t flog_entry_off = arena_off + info.flogoff;
uint32_t next_free_lba = info.external_nlba;
for (uint32_t i = 0; i < bttp->nfree; i++) {
struct btt_flog flog;
flog.lba = htole32(i);
flog.old_map = flog.new_map =
htole32(next_free_lba | BTT_MAP_ENTRY_ZERO);
flog.seq = htole32(1);
LOG(6, "flog[%u] entry off %" PRIu64
" initial %u + zero = %u",
i, flog_entry_off,
next_free_lba,
next_free_lba | BTT_MAP_ENTRY_ZERO);
if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &flog,
sizeof(flog), flog_entry_off) < 0)
return -1;
flog_entry_off += sizeof(flog);
LOG(6, "flog[%u] entry off %" PRIu64 " zeros",
i, flog_entry_off);
if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &Zflog,
sizeof(Zflog), flog_entry_off) < 0)
return -1;
flog_entry_off += sizeof(flog);
flog_entry_off = roundup(flog_entry_off,
BTT_FLOG_PAIR_ALIGN);
next_free_lba++;
}
memcpy(info.sig, Sig, BTTINFO_SIG_LEN);
memcpy(info.uuid, bttp->uuid, BTTINFO_UUID_LEN);
memcpy(info.parent_uuid, bttp->parent_uuid, BTTINFO_UUID_LEN);
info.major = BTTINFO_MAJOR_VERSION;
info.minor = BTTINFO_MINOR_VERSION;
btt_info_convert2le(&info);
util_checksum(&info, sizeof(info), &info.checksum, 1);
if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
sizeof(info), arena_off) < 0)
return -1;
if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, &info,
sizeof(info), arena_off + info.infooff) < 0)
return -1;
arena_off += info.nextoff;
}
ASSERTeq(bttp->narena, arena_num);
bttp->nlba = total_nlba;
if (write) {
return read_arenas(bttp, lane, bttp->narena);
}
return 0;
}
/*
 * read_layout -- find and load an existing layout, if there is one
 *
 * Walks the chain of info blocks starting at offset 0.  As soon as a
 * block fails read_info(), the namespace is treated as not laid out and
 * write_layout() is called with write == 0 to compute narena and nlba
 * only.  When every visited block is valid, the totals are stored and
 * the arenas are loaded through read_arenas().
 *
 * Returns 0 on success, -1 (errno set to EINVAL for inconsistent
 * metadata) on failure.
 */
static int
read_layout(struct btt *bttp, unsigned lane)
{
LOG(3, "bttp %p", bttp);
ASSERT(bttp->rawsize >= BTT_MIN_SIZE);
unsigned narena = 0;
uint32_t smallest_nfree = UINT32_MAX;
uint64_t rawsize = bttp->rawsize;
uint64_t total_nlba = 0;
uint64_t arena_off = 0;
/* default used when no layout exists; may be lowered further down */
bttp->nfree = BTT_DEFAULT_NFREE;
while (rawsize >= BTT_MIN_SIZE) {
narena++;
struct btt_info info;
if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &info,
sizeof(info), arena_off) < 0)
return -1;
/* no valid info block here: only compute what a layout would look like */
if (!read_info(bttp, &info)) {
return write_layout(bttp, lane, 0);
}
/* info is in host byte order from here on */
if (info.external_lbasize != bttp->lbasize) {
ERR("inconsistent lbasize");
errno = EINVAL;
return -1;
}
if (info.nfree == 0) {
ERR("invalid nfree");
errno = EINVAL;
return -1;
}
if (info.external_nlba == 0) {
ERR("invalid external_nlba");
errno = EINVAL;
return -1;
}
/* a non-zero nextoff is only accepted when it equals BTT_MAX_ARENA */
if (info.nextoff && (info.nextoff != BTT_MAX_ARENA)) {
ERR("invalid arena size");
errno = EINVAL;
return -1;
}
if (info.nfree < smallest_nfree)
smallest_nfree = info.nfree;
total_nlba += info.external_nlba;
arena_off += info.nextoff;
/* nextoff == 0 marks the last arena */
if (info.nextoff == 0)
break;
if (info.nextoff > rawsize) {
ERR("invalid next arena offset");
errno = EINVAL;
return -1;
}
rawsize -= info.nextoff;
}
ASSERT(narena);
bttp->narena = narena;
bttp->nlba = total_nlba;
/* use the smallest nfree found, but never more than the default */
if (smallest_nfree < bttp->nfree)
bttp->nfree = smallest_nfree;
return read_arenas(bttp, lane, narena);
}
/*
 * zero_block -- fill a caller-supplied block buffer with zeros
 *
 * Clears bttp->lbasize bytes of buf and always returns 0.
 */
static int
zero_block(struct btt *bttp, void *buf)
{
	LOG(3, "bttp %p", bttp);

	size_t block_bytes = bttp->lbasize;
	memset(buf, 0, block_bytes);

	return 0;
}
/*
 * lba_to_arena_lba -- translate a namespace LBA into an arena and the
 * pre-map LBA within that arena
 *
 * Arenas are walked in order, subtracting each arena's external_nlba
 * until the LBA falls inside one.  Always returns 0; an LBA beyond the
 * last arena trips an assertion.
 */
static int
lba_to_arena_lba(struct btt *bttp, uint64_t lba,
		struct arena **arenapp, uint32_t *premap_lbap)
{
	LOG(3, "bttp %p lba %" PRIu64, bttp, lba);

	ASSERT(bttp->laidout);

	unsigned arena = 0;
	while (arena < bttp->narena &&
			lba >= bttp->arenas[arena].external_nlba) {
		lba -= bttp->arenas[arena].external_nlba;
		arena++;
	}

	ASSERT(arena < bttp->narena);
	*arenapp = &bttp->arenas[arena];

	ASSERT(lba <= UINT32_MAX);
	*premap_lbap = (uint32_t)lba;

	LOG(3, "arenap %p pre-map LBA %u", *arenapp, *premap_lbap);
	return 0;
}
/*
 * btt_init -- create the run-time state for a BTT namespace
 *
 * rawsize must be at least BTT_MIN_SIZE.  The parent UUID, sizes and
 * namespace callbacks are recorded and the layout is read with lane 0.
 * The lane count becomes nfree, limited to maxlane when maxlane is
 * non-zero.
 *
 * Returns the new handle, or NULL on failure (errno is EINVAL when
 * rawsize is too small).
 */
struct btt *
btt_init(uint64_t rawsize, uint32_t lbasize, uint8_t parent_uuid[],
		unsigned maxlane, void *ns, const struct ns_callback *ns_cbp)
{
	LOG(3, "rawsize %" PRIu64 " lbasize %u", rawsize, lbasize);

	if (rawsize < BTT_MIN_SIZE) {
		ERR("rawsize smaller than BTT_MIN_SIZE %u", BTT_MIN_SIZE);
		errno = EINVAL;
		return NULL;
	}

	struct btt *handle = Zalloc(sizeof(*handle));
	if (handle == NULL) {
		ERR("!Malloc %zu bytes", sizeof(*handle));
		return NULL;
	}

	util_mutex_init(&handle->layout_write_mutex);

	handle->ns_cbp = ns_cbp;
	handle->ns = ns;
	handle->lbasize = lbasize;
	handle->rawsize = rawsize;
	memcpy(handle->parent_uuid, parent_uuid, BTTINFO_UUID_LEN);

	if (read_layout(handle, 0) < 0) {
		/* releases whatever has been allocated so far */
		btt_fini(handle);
		return NULL;
	}

	handle->nlane = handle->nfree;
	if (maxlane != 0 && handle->nlane > maxlane)
		handle->nlane = maxlane;

	LOG(3, "success, bttp %p nlane %u", handle, handle->nlane);
	return handle;
}
/*
 * btt_nlane -- return the number of lanes stored in the handle
 */
unsigned
btt_nlane(struct btt *bttp)
{
	LOG(3, "bttp %p", bttp);

	unsigned lanes = bttp->nlane;

	return lanes;
}
/*
 * btt_nlba -- return the total number of external LBAs in the namespace
 *
 * The 64-bit count kept in the handle is returned as size_t.
 */
size_t
btt_nlba(struct btt *bttp)
{
	LOG(3, "bttp %p", bttp);

	size_t blocks = bttp->nlba;

	return blocks;
}
/*
 * btt_read -- read one block from the namespace
 *
 * Blocks of a namespace without layout, and blocks whose map entry is in
 * the zero or initial state, read as zeros.  A map entry carrying the
 * error flag fails the read with EIO.
 *
 * While the data block is being read, its map entry is published in the
 * lane's read tracking slot so that btt_write() does not reuse the block.
 * The slot is reset to BTT_MAP_ENTRY_ERROR on every path out of this
 * function, including the ones taken when a retry finds that the entry
 * turned into an error or zero entry; leaving a stale value behind could
 * keep a writer waiting on a block nobody is reading.
 *
 * Returns 0 on success, -1 on failure.
 */
int
btt_read(struct btt *bttp, unsigned lane, uint64_t lba, void *buf)
{
	LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);

	if (invalid_lba(bttp, lba))
		return -1;

	/* without a layout every block reads as zeros */
	if (!bttp->laidout)
		return zero_block(bttp, buf);

	struct arena *arenap;
	uint32_t premap_lba;
	uint64_t map_entry_off;
	if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0)
		return -1;

	/* locate and read the map entry for the pre-map LBA */
	map_entry_off = arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba;

	uint32_t entry;
	if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &entry,
				sizeof(entry), map_entry_off) < 0)
		return -1;

	entry = le32toh(entry);

	/* loop until the published entry matches what the map holds */
	while (1) {
		if (map_entry_is_error(entry)) {
			/* retract anything published by an earlier pass */
			arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR;
			ERR("EIO due to map entry error flag");
			errno = EIO;
			return -1;
		}

		if (map_entry_is_zero_or_initial(entry)) {
			/* retract anything published by an earlier pass */
			arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR;
			return zero_block(bttp, buf);
		}

		/* publish the block that is about to be read */
		arenap->rtt[lane] = entry;
		util_synchronize();

		/* re-read the map entry to detect a change made meanwhile */
		uint32_t latest_entry;
		if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, &latest_entry,
				sizeof(latest_entry), map_entry_off) < 0) {
			arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR;
			return -1;
		}

		latest_entry = le32toh(latest_entry);

		if (entry == latest_entry)
			break;			/* map stayed the same */
		else
			entry = latest_entry;	/* try again */
	}

	/* the low bits of the entry select the internal data block */
	uint64_t data_block_off =
		arenap->dataoff + (uint64_t)(entry & BTT_MAP_ENTRY_LBA_MASK) *
		arenap->internal_lbasize;
	int readret = (*bttp->ns_cbp->nsread)(bttp->ns, lane, buf,
					bttp->lbasize, data_block_off);

	/* the read is over, retract the published entry */
	arenap->rtt[lane] = BTT_MAP_ENTRY_ERROR;

	return readret;
}
static int
map_lock(struct btt *bttp, unsigned lane, struct arena *arenap,
uint32_t *entryp, uint32_t premap_lba)
{
LOG(3, "bttp %p lane %u arenap %p premap_lba %u",
bttp, lane, arenap, premap_lba);
uint64_t map_entry_off =
arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba;
uint32_t map_lock_num = get_map_lock_num(premap_lba, bttp->nfree);
util_mutex_lock(&arenap->map_locks[map_lock_num]);
if ((*bttp->ns_cbp->nsread)(bttp->ns, lane, entryp,
sizeof(uint32_t), map_entry_off) < 0) {
util_mutex_unlock(&arenap->map_locks[map_lock_num]);
return -1;
}
if (map_entry_is_initial(*entryp))
*entryp = htole32(premap_lba | BTT_MAP_ENTRY_NORMAL);
LOG(9, "locked map[%d]: %u%s%s", premap_lba,
*entryp & BTT_MAP_ENTRY_LBA_MASK,
(map_entry_is_error(*entryp)) ? " ERROR" : "",
(map_entry_is_zero(*entryp)) ? " ZERO" : "");
return 0;
}
/*
 * map_abort -- release the map lock for a pre-map LBA without writing
 * the map entry
 */
static void
map_abort(struct btt *bttp, unsigned lane, struct arena *arenap,
		uint32_t premap_lba)
{
	LOG(3, "bttp %p lane %u arenap %p premap_lba %u",
			bttp, lane, arenap, premap_lba);

	uint32_t lock_num = get_map_lock_num(premap_lba, bttp->nfree);

	util_mutex_unlock(&arenap->map_locks[lock_num]);
}
/*
 * map_unlock -- write a map entry and release its map lock
 *
 * entry is written as passed in.  The lock is released whether or not
 * the write succeeds; the result of the write is returned.
 */
static int
map_unlock(struct btt *bttp, unsigned lane, struct arena *arenap,
		uint32_t entry, uint32_t premap_lba)
{
	LOG(3, "bttp %p lane %u arenap %p entry %u premap_lba %u",
			bttp, lane, arenap, entry, premap_lba);

	uint32_t lock_num = get_map_lock_num(premap_lba, bttp->nfree);
	uint64_t entry_off = arenap->mapoff + BTT_MAP_ENTRY_SIZE * premap_lba;

	/* store the new entry while the lock is still held */
	int write_status = (*bttp->ns_cbp->nswrite)(bttp->ns, lane, &entry,
				sizeof(uint32_t), entry_off);

	util_mutex_unlock(&arenap->map_locks[lock_num]);

	LOG(9, "unlocked map[%d]: %u%s%s", premap_lba,
			entry & BTT_MAP_ENTRY_LBA_MASK,
			(map_entry_is_error(entry)) ? " ERROR" : "",
			(map_entry_is_zero(entry)) ? " ZERO" : "");

	return write_status;
}
/*
 * btt_write -- write one block to the namespace
 *
 * The data goes into the free block recorded in the lane's flog entry;
 * afterwards the flog and then the map entry are updated while the map
 * lock for the LBA is held.  The first write to a namespace without
 * layout writes the layout first.
 *
 * Returns 0 on success, -1 on failure.  errno is EIO when the arena has
 * error flags set or when the final map update fails.
 */
int
btt_write(struct btt *bttp, unsigned lane, uint64_t lba, const void *buf)
{
LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);
if (invalid_lba(bttp, lba))
return -1;
/* write the layout on first use; laidout is checked again under the mutex */
if (!bttp->laidout) {
int err = 0;
util_mutex_lock(&bttp->layout_write_mutex);
if (!bttp->laidout)
err = write_layout(bttp, lane, 1);
util_mutex_unlock(&bttp->layout_write_mutex);
if (err < 0)
return err;
}
struct arena *arenap;
uint32_t premap_lba;
if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0)
return -1;
/* refuse writes to an arena that has error flags set */
if (arenap->flags & BTTINFO_FLAG_ERROR_MASK) {
ERR("EIO due to btt_info error flags 0x%x",
arenap->flags & BTTINFO_FLAG_ERROR_MASK);
errno = EIO;
return -1;
}
/* the lane's free block is the old_map value of its current flog entry */
uint32_t free_entry = (arenap->flogs[lane].flog.old_map &
BTT_MAP_ENTRY_LBA_MASK) | BTT_MAP_ENTRY_NORMAL;
LOG(3, "free_entry %u (before mask %u)", free_entry,
arenap->flogs[lane].flog.old_map);
/* busy-wait while any lane's rtt slot still names the free block */
for (unsigned i = 0; i < bttp->nlane; i++)
while (arenap->rtt[i] == free_entry)
;
/* write the data into the free block before touching flog or map */
uint64_t data_block_off = arenap->dataoff +
(uint64_t)(free_entry & BTT_MAP_ENTRY_LBA_MASK) *
arenap->internal_lbasize;
if ((*bttp->ns_cbp->nswrite)(bttp->ns, lane, buf,
bttp->lbasize, data_block_off) < 0)
return -1;
/* take the map lock; map_lock() returns the entry in little-endian order */
uint32_t old_entry;
if (map_lock(bttp, lane, arenap, &old_entry, premap_lba) < 0)
return -1;
old_entry = le32toh(old_entry);
/* record the change in the flog; on failure only the lock is dropped */
if (flog_update(bttp, lane, arenap, premap_lba,
old_entry, free_entry) < 0) {
map_abort(bttp, lane, arenap, premap_lba);
return -1;
}
/* store the new map entry and drop the lock */
if (map_unlock(bttp, lane, arenap, htole32(free_entry),
premap_lba) < 0) {
/* the flog was updated but the map was not: flag the arena */
set_arena_error(bttp, arenap, lane);
errno = EIO;
return -1;
}
return 0;
}
/*
 * map_entry_setf -- replace the flag bits of the map entry for an LBA
 *
 * The new entry keeps the block number of the old one and carries setf
 * as its flag bits.  Setting the zero flag is a no-op when no layout
 * exists yet or when the entry already is in the zero or initial state.
 * For any other flag, a missing layout is written first.
 *
 * Returns 0 on success, -1 on failure (errno is EIO when the arena has
 * error flags set).
 */
static int
map_entry_setf(struct btt *bttp, unsigned lane, uint64_t lba, uint32_t setf)
{
LOG(3, "bttp %p lane %u lba %" PRIu64 " setf 0x%x",
bttp, lane, lba, setf);
if (invalid_lba(bttp, lba))
return -1;
if (!bttp->laidout) {
/* without a layout every block already reads as zeros */
if (setf == BTT_MAP_ENTRY_ZERO)
return 0;
/* write the layout; laidout is checked again under the mutex */
int err = 0;
util_mutex_lock(&bttp->layout_write_mutex);
if (!bttp->laidout)
err = write_layout(bttp, lane, 1);
util_mutex_unlock(&bttp->layout_write_mutex);
if (err < 0)
return err;
}
struct arena *arenap;
uint32_t premap_lba;
if (lba_to_arena_lba(bttp, lba, &arenap, &premap_lba) < 0)
return -1;
/* refuse updates to an arena that has error flags set */
if (arenap->flags & BTTINFO_FLAG_ERROR_MASK) {
ERR("EIO due to btt_info error flags 0x%x",
arenap->flags & BTTINFO_FLAG_ERROR_MASK);
errno = EIO;
return -1;
}
uint32_t old_entry;
uint32_t new_entry;
/* take the map lock; map_lock() returns the entry in little-endian order */
if (map_lock(bttp, lane, arenap, &old_entry, premap_lba) < 0)
return -1;
old_entry = le32toh(old_entry);
/* nothing to do when the block already reads as zeros */
if (setf == BTT_MAP_ENTRY_ZERO &&
map_entry_is_zero_or_initial(old_entry)) {
map_abort(bttp, lane, arenap, premap_lba);
return 0;
}
/* keep the block number, replace the flag bits */
new_entry = (old_entry & BTT_MAP_ENTRY_LBA_MASK) | setf;
/* map_unlock() writes the entry and releases the lock in any case */
if (map_unlock(bttp, lane, arenap, htole32(new_entry), premap_lba) < 0)
return -1;
return 0;
}
/*
 * btt_set_zero -- mark a block as zeroed
 *
 * Passes BTT_MAP_ENTRY_ZERO to map_entry_setf() and returns its result.
 */
int
btt_set_zero(struct btt *bttp, unsigned lane, uint64_t lba)
{
	LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);

	int status = map_entry_setf(bttp, lane, lba, BTT_MAP_ENTRY_ZERO);

	return status;
}
/*
 * btt_set_error -- mark a block as being in the error state
 *
 * Passes BTT_MAP_ENTRY_ERROR to map_entry_setf() and returns its result.
 */
int
btt_set_error(struct btt *bttp, unsigned lane, uint64_t lba)
{
	LOG(3, "bttp %p lane %u lba %" PRIu64, bttp, lane, lba);

	int status = map_entry_setf(bttp, lane, lba, BTT_MAP_ENTRY_ERROR);

	return status;
}
/*
 * check_arena -- check the map and flog of one arena for consistency
 *
 * Every internal block must be referenced exactly once, either by a map
 * entry (entries in the initial state count as the identity mapping) or
 * by the old_map value of a flog entry.  References are collected in a
 * bitmap with one bit per internal block.
 *
 * Returns 1 when the arena is consistent, 0 when duplicate or
 * unreferenced blocks were found, and -1 on failure: allocation or
 * mapping errors, or a map or flog entry that names a block outside the
 * arena (errno is EINVAL in that case).  The bitmap is released on every
 * path, and flog block numbers are range-checked before they are used as
 * bitmap indexes.
 */
static int
check_arena(struct btt *bttp, struct arena *arenap)
{
	LOG(3, "bttp %p arenap %p", bttp, arenap);

	int consistent = 1;
	int oerrno;

	uint64_t map_entry_off = arenap->mapoff;
	uint32_t bitmapsize = howmany(arenap->internal_nlba, 8);
	uint8_t *bitmap = Zalloc(bitmapsize);
	if (bitmap == NULL) {
		ERR("!Malloc for bitmap");
		return -1;
	}

	/* walk the map, mapping as much of it as nsmap hands out at a time */
	uint32_t *mapp = NULL;
	ssize_t mlen;
	int next_index = 0;
	size_t remaining = 0;
	for (uint32_t i = 0; i < arenap->external_nlba; i++) {
		uint32_t entry;

		if (remaining == 0) {
			/* request a mapping of the rest of the map */
			size_t req_len =
				(arenap->external_nlba - i) * sizeof(uint32_t);
			mlen = (*bttp->ns_cbp->nsmap)(bttp->ns, 0,
				(void **)&mapp, req_len, map_entry_off);

			if (mlen < 0)
				goto err;

			remaining = (size_t)mlen;
			next_index = 0;
		}
		entry = le32toh(mapp[next_index]);

		/* dump entries that are neither zero nor initial */
		if (map_entry_is_zero_or_initial(entry) == 0)
			LOG(11, "map[%d]: %u%s", i,
				entry & BTT_MAP_ENTRY_LBA_MASK,
				(map_entry_is_error(entry)) ? " ERROR" : "");

		/* an entry in the initial state maps the block to itself */
		if (map_entry_is_initial(entry))
			entry = i;
		else
			entry &= BTT_MAP_ENTRY_LBA_MASK;

		if (entry >= arenap->internal_nlba) {
			ERR("map[%d] entry out of bounds: %u", i, entry);
			errno = EINVAL;
			goto err;
		}

		if (util_isset(bitmap, entry)) {
			ERR("map[%d] duplicate entry: %u", i, entry);
			consistent = 0;
		} else
			util_setbit(bitmap, entry);

		map_entry_off += sizeof(uint32_t);
		next_index++;
		ASSERT(remaining >= sizeof(uint32_t));
		remaining -= sizeof(uint32_t);
	}

	/* add the blocks named by the flog entries */
	for (uint32_t i = 0; i < bttp->nfree; i++) {
		uint32_t entry = arenap->flogs[i].flog.old_map;
		entry &= BTT_MAP_ENTRY_LBA_MASK;

		/* never index the bitmap with a block outside the arena */
		if (entry >= arenap->internal_nlba) {
			ERR("flog[%u] entry out of bounds: %u", i, entry);
			errno = EINVAL;
			goto err;
		}

		if (util_isset(bitmap, entry)) {
			ERR("flog[%u] duplicate entry: %u", i, entry);
			consistent = 0;
		} else
			util_setbit(bitmap, entry);
	}

	/* every internal block must have been referenced by now */
	for (uint32_t i = 0; i < arenap->internal_nlba; i++)
		if (util_isclr(bitmap, i)) {
			ERR("unreferenced lba: %d", i);
			consistent = 0;
		}

	Free(bitmap);

	return consistent;

err:
	/* release the bitmap without disturbing the errno of the failure */
	oerrno = errno;
	Free(bitmap);
	errno = oerrno;
	return -1;
}
/*
 * btt_check -- check the consistency of all arenas
 *
 * A namespace without layout counts as consistent.  Returns 1 when every
 * arena is consistent, 0 when at least one is not, and the negative
 * result of check_arena() as soon as one arena cannot be checked.
 */
int
btt_check(struct btt *bttp)
{
	LOG(3, "bttp %p", bttp);

	if (!bttp->laidout) {
		LOG(3, "no layout yet");
		return 1;
	}

	int all_consistent = 1;
	for (unsigned i = 0; i < bttp->narena; i++) {
		int retval = check_arena(bttp, &bttp->arenas[i]);

		if (retval < 0)
			return retval;

		if (retval == 0)
			all_consistent = 0;
	}

	return all_consistent;
}
void
btt_fini(struct btt *bttp)
{
LOG(3, "bttp %p", bttp);
if (bttp->arenas) {
for (unsigned i = 0; i < bttp->narena; i++) {
if (bttp->arenas[i].flogs)
Free(bttp->arenas[i].flogs);
if (bttp->arenas[i].rtt)
Free((void *)bttp->arenas[i].rtt);
if (bttp->arenas[i].rtt)
Free((void *)bttp->arenas[i].map_locks);
}
Free(bttp->arenas);
}
Free(bttp);
}