#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/param.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include <stdint.h>
#include <endian.h>
#include "libpmem.h"
#include "libpmemblk.h"
#include "mmap.h"
#include "set.h"
#include "out.h"
#include "btt.h"
#include "blk.h"
#include "util.h"
#include "sys_util.h"
#include "util_pmem.h"
#include "valgrind_internal.h"
/*
 * lane_enter -- (internal) select a lane and take its lock
 *
 * The lane index is derived from the pool's next_lane counter, reduced
 * modulo the number of lanes.  On return the mutex for that lane is held
 * and the index has been stored in *lane; the caller releases it with
 * lane_exit().
 */
static void
lane_enter(PMEMblkpool *pbp, unsigned *lane)
{
	/* bump the shared counter and fold the fetched value into range */
	unsigned picked = util_fetch_and_add32(&pbp->next_lane, 1) % pbp->nlane;

	/* lane chosen, now serialize with other users of the same lane */
	util_mutex_lock(&pbp->locks[picked]);

	*lane = picked;
}
/*
 * lane_exit -- (internal) release the per-lane lock for lane "mylane"
 *
 * mylane is the index previously handed out by lane_enter().
 */
static void
lane_exit(PMEMblkpool *pbp, unsigned mylane)
{
util_mutex_unlock(&pbp->locks[mylane]);
}
/*
 * nsread -- (internal) read data from the pool's data area
 *
 * Installed as the nsread callback in ns_cb, which is handed to btt_init().
 *
 * ns    -- the struct pmemblk the callbacks were registered with
 * lane  -- only used for logging here
 * buf   -- destination buffer, must hold at least count bytes
 * count -- number of bytes to copy
 * off   -- byte offset from the start of the data area
 *
 * Returns 0 on success.  If [off, off + count) does not fit inside the data
 * area, returns -1 with errno set to EINVAL and copies nothing.
 */
static int
nsread(void *ns, unsigned lane, void *buf, size_t count, uint64_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %u count %zu off %" PRIu64, pbp, lane, count, off);

	/*
	 * Validate without computing off + count: that sum is unsigned and
	 * silently wraps for a very large offset, which would let an
	 * out-of-range request pass the check.
	 */
	if (off > pbp->datasize || count > pbp->datasize - off) {
		ERR("offset + count (%zu) past end of data area (%zu)",
				(size_t)off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	memcpy(buf, (char *)pbp->data + off, count);

	return 0;
}
/*
 * nswrite -- (internal) write data to the pool's data area
 *
 * Installed as the nswrite callback in ns_cb, which is handed to btt_init().
 *
 * ns    -- the struct pmemblk the callbacks were registered with
 * lane  -- only used for logging here
 * buf   -- source buffer holding count bytes
 * count -- number of bytes to copy
 * off   -- byte offset from the start of the data area
 *
 * Returns 0 on success.  Returns -1 with errno set to EINVAL when
 * [off, off + count) does not fit inside the data area, or -1 with errno
 * left as set by pmem_msync() when flushing a non-pmem mapping fails.
 */
static int
nswrite(void *ns, unsigned lane, const void *buf, size_t count,
		uint64_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %u count %zu off %" PRIu64, pbp, lane, count, off);

	/*
	 * Validate without computing off + count: that sum is unsigned and
	 * silently wraps for a very large offset, which would let an
	 * out-of-range request pass the check.
	 */
	if (off > pbp->datasize || count > pbp->datasize - off) {
		ERR("offset + count (%zu) past end of data area (%zu)",
				(size_t)off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	void *dest = (char *)pbp->data + off;

#ifdef DEBUG
	/* debug builds serialize the RW/RO window below */
	util_mutex_lock(&pbp->write_lock);
#endif

	/* open the destination range for writing (see RANGE_RW) */
	RANGE_RW(dest, count, pbp->is_dev_dax);

	if (pbp->is_pmem)
		pmem_memcpy_nodrain(dest, buf, count);
	else
		memcpy(dest, buf, count);

	/* put the range back to read-only (see RANGE_RO) */
	RANGE_RO(dest, count, pbp->is_dev_dax);

#ifdef DEBUG
	util_mutex_unlock(&pbp->write_lock);
#endif

	if (pbp->is_pmem) {
		/* completes the preceding pmem_memcpy_nodrain() */
		pmem_drain();
	} else if (pmem_msync(dest, count) != 0) {
		/* the data may not have reached the media: report it */
		ERR("!pmem_msync");
		return -1;
	}

	return 0;
}
/*
 * nsmap -- (internal) hand out a direct pointer into the pool's data area
 *
 * Installed as the nsmap callback in ns_cb, which is handed to btt_init().
 *
 * ns    -- the struct pmemblk the callbacks were registered with
 * lane  -- only used for logging here
 * addrp -- receives the address corresponding to off
 * len   -- number of bytes requested; must be representable as ssize_t
 * off   -- byte offset from the start of the data area
 *
 * Returns len on success (the full requested length is always provided).
 * Returns -1 with errno set to EINVAL when off + len reaches or passes the
 * end of the data area; *addrp is left untouched in that case.
 *
 * NOTE(review): unlike nsread/nswrite/nszero, a range ending exactly at the
 * end of the data area is rejected here (">=" rather than ">").  That is the
 * pre-existing behavior and is preserved -- confirm it is intended.
 */
static ssize_t
nsmap(void *ns, unsigned lane, void **addrp, size_t len, uint64_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(12, "pbp %p lane %u len %zu off %" PRIu64, pbp, lane, len, off);

	ASSERT(((ssize_t)len) >= 0);

	/*
	 * Same condition as "off + len >= datasize", but written so the
	 * unsigned sum is never formed and therefore cannot wrap around.
	 */
	if (off >= pbp->datasize || len >= pbp->datasize - off) {
		ERR("offset + len (%zu) past end of data area (%zu)",
				(size_t)off + len, pbp->datasize - 1);
		errno = EINVAL;
		return -1;
	}

	*addrp = (char *)pbp->data + off;

	LOG(12, "returning addr %p", *addrp);

	return (ssize_t)len;
}
/*
 * nssync -- (internal) flush a range previously obtained through nsmap
 *
 * Installed as the nssync callback in ns_cb.  Uses pmem_persist() when the
 * pool is flagged is_pmem and pmem_msync() otherwise.  The result of
 * pmem_msync() is not examined; this callback has no way to report it.
 */
static void
nssync(void *ns, unsigned lane, void *addr, size_t len)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(12, "pbp %p lane %u addr %p len %zu", pbp, lane, addr, len);

	if (!pbp->is_pmem) {
		pmem_msync(addr, len);
		return;
	}

	pmem_persist(addr, len);
}
/*
 * nszero -- (internal) zero a range of the pool's data area
 *
 * Installed as the nszero callback in ns_cb, which is handed to btt_init().
 *
 * ns    -- the struct pmemblk the callbacks were registered with
 * lane  -- only used for logging here
 * count -- number of bytes to zero
 * off   -- byte offset from the start of the data area
 *
 * Returns 0 on success.  If [off, off + count) does not fit inside the data
 * area, returns -1 with errno set to EINVAL and modifies nothing.
 *
 * NOTE(review): pmem_memset_persist() is used regardless of pbp->is_pmem,
 * whereas nswrite() falls back to pmem_msync() for non-pmem mappings --
 * confirm the zeroed range is made durable on non-pmem pools.
 */
static int
nszero(void *ns, unsigned lane, size_t count, uint64_t off)
{
	struct pmemblk *pbp = (struct pmemblk *)ns;

	LOG(13, "pbp %p lane %u count %zu off %" PRIu64, pbp, lane, count, off);

	/*
	 * Validate without computing off + count: that sum is unsigned and
	 * silently wraps for a very large offset, which would let an
	 * out-of-range request pass the check.
	 */
	if (off > pbp->datasize || count > pbp->datasize - off) {
		ERR("offset + count (%zu) past end of data area (%zu)",
				(size_t)off + count, pbp->datasize);
		errno = EINVAL;
		return -1;
	}

	void *dest = (char *)pbp->data + off;

	/* open the range for writing, zero it, then restore read-only */
	RANGE_RW(dest, count, pbp->is_dev_dax);
	pmem_memset_persist(dest, 0, count);
	RANGE_RO(dest, count, pbp->is_dev_dax);

	return 0;
}
/*
 * ns_cb -- callbacks passed to btt_init() so the btt module can access
 * the pool's data area
 *
 * ns_is_zeroed starts out as 0 and is overwritten by blk_runtime_init().
 */
static struct ns_callback ns_cb = {
	/* copy-style access */
	.nsread = nsread,
	.nswrite = nswrite,
	.nszero = nszero,

	/* direct-pointer access and the matching flush */
	.nsmap = nsmap,
	.nssync = nssync,

	.ns_is_zeroed = 0,
};
/*
 * blk_descr_create -- (internal) fill in the block pool descriptor fields
 *
 * Stores the block size and the "zeroed" flag in the pool structure and
 * flushes each field with util_persist() right after it is written.
 */
static void
blk_descr_create(PMEMblkpool *pbp, uint32_t bsize, int zeroed)
{
LOG(3, "pbp %p bsize %u zeroed %d", pbp, bsize, zeroed);
/* the block size is stored in little-endian byte order */
pbp->bsize = htole32(bsize);
util_persist(pbp->is_pmem, &pbp->bsize, sizeof(bsize));
/* record the caller-supplied zeroed flag as-is */
pbp->is_zeroed = zeroed;
util_persist(pbp->is_pmem, &pbp->is_zeroed, sizeof(pbp->is_zeroed));
}
/*
 * blk_descr_check -- (internal) validate the requested block size against
 * the one stored in the pool
 *
 * *bsize == 0 means "no particular size requested".  A non-zero *bsize must
 * equal the stored block size, otherwise -1 is returned with errno set to
 * EINVAL and *bsize is left unchanged.  On success *bsize is set to the
 * stored block size and 0 is returned.
 */
static int
blk_descr_check(PMEMblkpool *pbp, size_t *bsize)
{
	LOG(3, "pbp %p bsize %zu", pbp, *bsize);

	/* the stored value is little-endian, convert to host order */
	size_t hdr_bsize = le32toh(pbp->bsize);

	int caller_has_size = (*bsize != 0);
	if (caller_has_size && *bsize != hdr_bsize) {
		ERR("wrong bsize (%zu), pool created with bsize %zu",
				*bsize, hdr_bsize);
		errno = EINVAL;
		return -1;
	}

	*bsize = hdr_bsize;
	LOG(3, "using block size from header: %zu", *bsize);

	return 0;
}
/*
 * blk_runtime_init -- (internal) set up the run-time state of a block pool
 *
 * Computes the data area location and size, creates the BTT handle via
 * btt_init(), and allocates and initializes one mutex per lane.
 *
 * pbp    -- pool whose addr, size, is_pmem, is_dev_dax and is_zeroed fields
 *           are already filled in
 * bsize  -- block size, passed to btt_init() truncated to uint32_t
 * rdonly -- stored in pbp->rdonly
 *
 * Returns 0 on success, -1 on failure with errno preserved from the failing
 * call.
 */
static int
blk_runtime_init(PMEMblkpool *pbp, size_t bsize, int rdonly)
{
LOG(3, "pbp %p bsize %zu rdonly %d",
pbp, bsize, rdonly);
/*
 * The range starts at pbp->addr and its length is the size of the
 * structure minus the pool header, bsize and is_zeroed fields.
 * NOTE(review): presumably this excludes the run-time fields from
 * Valgrind's pmem tracking -- confirm against valgrind_internal.h.
 */
VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
sizeof(struct pmemblk) -
sizeof(struct pool_hdr) -
sizeof(pbp->bsize) -
sizeof(pbp->is_zeroed));
pbp->rdonly = rdonly;
/* the data area begins after the structure, aligned up */
pbp->data = (char *)pbp->addr +
roundup(sizeof(*pbp), BLK_FORMAT_DATA_ALIGN);
ASSERT(((char *)pbp->addr + pbp->size) >= (char *)pbp->data);
/* everything from pbp->data to the end of the pool is data */
pbp->datasize = (size_t)
(((char *)pbp->addr + pbp->size) - (char *)pbp->data);
LOG(4, "data area %p data size %zu bsize %zu",
pbp->data, pbp->datasize, bsize);
/* fall back to a single CPU if the count cannot be determined */
long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
if (ncpus < 1)
ncpus = 1;
/*
 * ns_cb is a single file-scope object, so this assignment is visible
 * to every pool that passes &ns_cb to btt_init().
 */
ns_cb.ns_is_zeroed = pbp->is_zeroed;
/* bttp is released at "err" if it is not NULL */
struct btt *bttp = NULL;
os_mutex_t *locks = NULL;
/* twice the CPU count is passed as the lane-count argument */
bttp = btt_init(pbp->datasize, (uint32_t)bsize, pbp->hdr.poolset_uuid,
(unsigned)ncpus * 2, pbp, &ns_cb);
if (bttp == NULL)
goto err;
pbp->bttp = bttp;
/* use the lane count reported by the btt module */
pbp->nlane = btt_nlane(pbp->bttp);
pbp->next_lane = 0;
if ((locks = Malloc(pbp->nlane * sizeof(*locks))) == NULL) {
ERR("!Malloc for lane locks");
goto err;
}
for (unsigned i = 0; i < pbp->nlane; i++)
util_mutex_init(&locks[i]);
pbp->locks = locks;
#ifdef DEBUG
/* lock used by nswrite() in debug builds */
util_mutex_init(&pbp->write_lock);
#endif
/*
 * NOTE(review): RANGE_NONE/RANGE_RO presumably change the protection
 * of the pool header and of the data area -- confirm in the header
 * that defines them.
 */
RANGE_NONE(pbp->addr, sizeof(struct pool_hdr), pbp->is_dev_dax);
RANGE_RO(pbp->data, pbp->datasize, pbp->is_dev_dax);
return 0;
err:
LOG(4, "error clean up");
/* keep the errno of the failing call across the cleanup */
int oerrno = errno;
/* locks is still NULL on every path that reaches this label */
if (bttp)
btt_fini(bttp);
errno = oerrno;
return -1;
}
#ifndef _WIN32
static inline
#endif
PMEMblkpool *
pmemblk_createU(const char *path, size_t bsize, size_t poolsize, mode_t mode)
{
LOG(3, "path %s bsize %zu poolsize %zu mode %o",
path, bsize, poolsize, mode);
if (bsize == 0) {
ERR("Invalid block size %zu", bsize);
errno = EINVAL;
return NULL;
}
if (bsize > UINT32_MAX) {
ERR("Invalid block size %zu", bsize);
errno = EINVAL;
return NULL;
}
struct pool_set *set;
if (util_pool_create(&set, path,
poolsize, PMEMBLK_MIN_POOL, PMEMBLK_MIN_PART,
BLK_HDR_SIG, BLK_FORMAT_MAJOR,
BLK_FORMAT_COMPAT_DEFAULT, BLK_FORMAT_INCOMPAT_DEFAULT,
BLK_FORMAT_RO_COMPAT_DEFAULT, NULL,
REPLICAS_DISABLED) != 0) {
LOG(2, "cannot create pool or pool set");
return NULL;
}
ASSERT(set->nreplicas > 0);
struct pool_replica *rep = set->replica[0];
PMEMblkpool *pbp = rep->part[0].addr;
VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
sizeof(struct pmemblk) -
((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr));
pbp->addr = pbp;
pbp->size = rep->repsize;
pbp->set = set;
pbp->is_pmem = rep->is_pmem;
pbp->is_dev_dax = rep->part[0].is_dev_dax;
ASSERT(!pbp->is_dev_dax || pbp->is_pmem);
blk_descr_create(pbp, (uint32_t)bsize, set->zeroed);
if (blk_runtime_init(pbp, bsize, 0) != 0) {
ERR("pool initialization failed");
goto err;
}
if (util_poolset_chmod(set, mode))
goto err;
util_poolset_fdclose(set);
LOG(3, "pbp %p", pbp);
return pbp;
err:
LOG(4, "error clean up");
int oerrno = errno;
util_poolset_close(set, DELETE_CREATED_PARTS);
errno = oerrno;
return NULL;
}
#ifndef _WIN32
/*
 * pmemblk_create -- create a block memory pool
 *
 * Forwards all arguments to pmemblk_createU() and returns its result.
 */
PMEMblkpool *
pmemblk_create(const char *path, size_t bsize, size_t poolsize, mode_t mode)
{
	PMEMblkpool *pbp = pmemblk_createU(path, bsize, poolsize, mode);

	return pbp;
}
#else
/*
 * pmemblk_createW -- create a block memory pool (wide-character path)
 *
 * Converts path with util_toUTF8() and forwards to pmemblk_createU().
 * Returns NULL if the conversion fails or the pool cannot be created.
 */
PMEMblkpool *
pmemblk_createW(const wchar_t *path, size_t bsize, size_t poolsize,
	mode_t mode)
{
	PMEMblkpool *pbp = NULL;
	char *utf8_path = util_toUTF8(path);

	if (utf8_path != NULL) {
		pbp = pmemblk_createU(utf8_path, bsize, poolsize, mode);
		/* the converted copy is only needed for the call above */
		util_free_UTF8(utf8_path);
	}

	return pbp;
}
#endif
/*
 * blk_open_common -- (internal) open a block memory pool
 *
 * Shared by pmemblk_openU() (cow == 0) and pmemblk_checkU() (cow == 1);
 * the cow argument is passed straight through to util_pool_open().
 *
 * bsize == 0 accepts whatever block size the pool was created with; a
 * non-zero bsize must match it (see blk_descr_check()).
 *
 * Returns the pool handle, or NULL with errno set on failure.  On failure
 * after the pool set was opened, it is closed with DO_NOT_DELETE_PARTS.
 */
static PMEMblkpool *
blk_open_common(const char *path, size_t bsize, int cow)
{
LOG(3, "path %s bsize %zu cow %d", path, bsize, cow);
struct pool_set *set;
if (util_pool_open(&set, path, cow, PMEMBLK_MIN_PART,
BLK_HDR_SIG, BLK_FORMAT_MAJOR,
BLK_FORMAT_COMPAT_CHECK, BLK_FORMAT_INCOMPAT_CHECK,
BLK_FORMAT_RO_COMPAT_CHECK, NULL, NULL) != 0) {
LOG(2, "cannot open pool or pool set");
return NULL;
}
ASSERT(set->nreplicas > 0);
/* the pool structure lives at the start of the first part */
struct pool_replica *rep = set->replica[0];
PMEMblkpool *pbp = rep->part[0].addr;
VALGRIND_REMOVE_PMEM_MAPPING(&pbp->addr,
sizeof(struct pmemblk) -
((uintptr_t)&pbp->addr - (uintptr_t)&pbp->hdr));
pbp->addr = pbp;
pbp->size = rep->repsize;
pbp->set = set;
pbp->is_pmem = rep->is_pmem;
pbp->is_dev_dax = rep->part[0].is_dev_dax;
/* is_dev_dax implies is_pmem */
ASSERT(!pbp->is_dev_dax || pbp->is_pmem);
/* pool sets with more than one replica are rejected */
if (set->nreplicas > 1) {
errno = ENOTSUP;
ERR("!replicas not supported");
goto err;
}
/* on success bsize holds the block size stored in the pool */
if (blk_descr_check(pbp, &bsize) != 0) {
LOG(2, "descriptor check failed");
goto err;
}
if (blk_runtime_init(pbp, bsize, set->rdonly) != 0) {
ERR("pool initialization failed");
goto err;
}
util_poolset_fdclose(set);
LOG(3, "pbp %p", pbp);
return pbp;
err:
LOG(4, "error clean up");
/* keep the errno of the failing step across the close */
int oerrno = errno;
util_poolset_close(set, DO_NOT_DELETE_PARTS);
errno = oerrno;
return NULL;
}
/*
 * pmemblk_openU -- open a block memory pool
 *
 * Calls blk_open_common() with cow set to 0 and returns its result.
 * On non-Windows builds this function is file-local (static inline).
 */
#ifndef _WIN32
static inline
#endif
PMEMblkpool *
pmemblk_openU(const char *path, size_t bsize)
{
	LOG(3, "path %s bsize %zu", path, bsize);

	PMEMblkpool *pbp = blk_open_common(path, bsize, 0);

	return pbp;
}
#ifndef _WIN32
/*
 * pmemblk_open -- open a block memory pool
 *
 * Forwards both arguments to pmemblk_openU() and returns its result.
 */
PMEMblkpool *
pmemblk_open(const char *path, size_t bsize)
{
	PMEMblkpool *pbp = pmemblk_openU(path, bsize);

	return pbp;
}
#else
/*
 * pmemblk_openW -- open a block memory pool (wide-character path)
 *
 * Converts path with util_toUTF8() and forwards to pmemblk_openU().
 * Returns NULL if the conversion fails or the pool cannot be opened.
 */
PMEMblkpool *
pmemblk_openW(const wchar_t *path, size_t bsize)
{
	PMEMblkpool *pbp = NULL;
	char *utf8_path = util_toUTF8(path);

	if (utf8_path != NULL) {
		pbp = pmemblk_openU(utf8_path, bsize);
		/* the converted copy is only needed for the call above */
		util_free_UTF8(utf8_path);
	}

	return pbp;
}
#endif
/*
 * pmemblk_close -- close a block memory pool
 *
 * Tears down the BTT handle, destroys and frees the per-lane locks (when
 * the array is present), destroys the debug write lock in DEBUG builds and
 * finally closes the pool set without deleting its parts.
 */
void
pmemblk_close(PMEMblkpool *pbp)
{
	LOG(3, "pbp %p", pbp);

	btt_fini(pbp->bttp);

	if (pbp->locks != NULL) {
		unsigned lane = 0;

		while (lane < pbp->nlane) {
			os_mutex_destroy(&pbp->locks[lane]);
			lane++;
		}

		Free((void *)pbp->locks);
	}

#ifdef DEBUG
	os_mutex_destroy(&pbp->write_lock);
#endif

	util_poolset_close(pbp->set, DO_NOT_DELETE_PARTS);
}
/*
 * pmemblk_bsize -- return the block size of a block memory pool
 *
 * The stored value is little-endian and is converted to host order.
 */
size_t
pmemblk_bsize(PMEMblkpool *pbp)
{
	LOG(3, "pbp %p", pbp);

	size_t host_bsize = le32toh(pbp->bsize);

	return host_bsize;
}
/*
 * pmemblk_nblock -- return the number of usable blocks in a block memory pool
 *
 * The value is whatever btt_nlba() reports for the pool's BTT handle.
 */
size_t
pmemblk_nblock(PMEMblkpool *pbp)
{
	LOG(3, "pbp %p", pbp);

	size_t nblock = btt_nlba(pbp->bttp);

	return nblock;
}
/*
 * pmemblk_read -- read a block in a block memory pool
 *
 * buf receives the contents of block blockno.  Returns the result of
 * btt_read(), or -1 with errno set to EINVAL for a negative blockno.
 * The call runs with one lane lock held for the duration of the read.
 */
int
pmemblk_read(PMEMblkpool *pbp, void *buf, long long blockno)
{
	LOG(3, "pbp %p buf %p blockno %lld", pbp, buf, blockno);

	if (blockno >= 0) {
		unsigned lane;
		int ret;

		lane_enter(pbp, &lane);
		ret = btt_read(pbp->bttp, lane, (uint64_t)blockno, buf);
		lane_exit(pbp, lane);

		return ret;
	}

	ERR("negative block number");
	errno = EINVAL;
	return -1;
}
/*
 * pmemblk_write -- write a block (atomically) in a block memory pool
 *
 * buf supplies the new contents of block blockno.  Returns the result of
 * btt_write(); fails with -1 and errno EROFS on a read-only pool, or with
 * -1 and errno EINVAL for a negative blockno.  The read-only check is made
 * before the block number check.
 */
int
pmemblk_write(PMEMblkpool *pbp, const void *buf, long long blockno)
{
	unsigned lane;
	int ret;

	LOG(3, "pbp %p buf %p blockno %lld", pbp, buf, blockno);

	if (pbp->rdonly) {
		ERR("EROFS (pool is read-only)");
		errno = EROFS;
		return -1;
	}

	if (blockno < 0) {
		ERR("negative block number");
		errno = EINVAL;
		return -1;
	}

	/* hold one lane for the duration of the BTT operation */
	lane_enter(pbp, &lane);
	ret = btt_write(pbp->bttp, lane, (uint64_t)blockno, buf);
	lane_exit(pbp, lane);

	return ret;
}
/*
 * pmemblk_set_zero -- zero a block in a block memory pool
 *
 * Returns the result of btt_set_zero(); fails with -1 and errno EROFS on a
 * read-only pool, or with -1 and errno EINVAL for a negative blockno.  The
 * read-only check is made before the block number check.
 */
int
pmemblk_set_zero(PMEMblkpool *pbp, long long blockno)
{
	unsigned lane;
	int ret;

	LOG(3, "pbp %p blockno %lld", pbp, blockno);

	if (pbp->rdonly) {
		ERR("EROFS (pool is read-only)");
		errno = EROFS;
		return -1;
	}

	if (blockno < 0) {
		ERR("negative block number");
		errno = EINVAL;
		return -1;
	}

	/* hold one lane for the duration of the BTT operation */
	lane_enter(pbp, &lane);
	ret = btt_set_zero(pbp->bttp, lane, (uint64_t)blockno);
	lane_exit(pbp, lane);

	return ret;
}
/*
 * pmemblk_set_error -- set the error state on a block in a block memory pool
 *
 * Returns the result of btt_set_error(); fails with -1 and errno EROFS on a
 * read-only pool, or with -1 and errno EINVAL for a negative blockno.  The
 * read-only check is made before the block number check.
 */
int
pmemblk_set_error(PMEMblkpool *pbp, long long blockno)
{
	unsigned lane;
	int ret;

	LOG(3, "pbp %p blockno %lld", pbp, blockno);

	if (pbp->rdonly) {
		ERR("EROFS (pool is read-only)");
		errno = EROFS;
		return -1;
	}

	if (blockno < 0) {
		ERR("negative block number");
		errno = EINVAL;
		return -1;
	}

	/* hold one lane for the duration of the BTT operation */
	lane_enter(pbp, &lane);
	ret = btt_set_error(pbp->bttp, lane, (uint64_t)blockno);
	lane_exit(pbp, lane);

	return ret;
}
/*
 * pmemblk_checkU -- run a consistency check on a block memory pool
 *
 * Opens the pool through blk_open_common() with cow set to 1, runs
 * btt_check() on it and closes it again.  Returns -1 if the pool cannot be
 * opened, otherwise the result of btt_check().  errno as left by
 * btt_check() is preserved across the close.
 * On non-Windows builds this function is file-local (static inline).
 */
#ifndef _WIN32
static inline
#endif
int
pmemblk_checkU(const char *path, size_t bsize)
{
	LOG(3, "path \"%s\" bsize %zu", path, bsize);

	PMEMblkpool *pbp = blk_open_common(path, bsize, 1);
	if (!pbp)
		return -1;

	int check_result = btt_check(pbp->bttp);

	/* closing must not disturb the errno reported by the check */
	int saved_errno = errno;
	pmemblk_close(pbp);
	errno = saved_errno;

	return check_result;
}
#ifndef _WIN32
/*
 * pmemblk_check -- run a consistency check on a block memory pool
 *
 * Forwards both arguments to pmemblk_checkU() and returns its result.
 */
int
pmemblk_check(const char *path, size_t bsize)
{
	int result = pmemblk_checkU(path, bsize);

	return result;
}
#else
/*
 * pmemblk_checkW -- run a consistency check on a block memory pool
 * (wide-character path)
 *
 * Converts path with util_toUTF8() and forwards to pmemblk_checkU().
 * Returns -1 if the conversion fails.
 */
int
pmemblk_checkW(const wchar_t *path, size_t bsize)
{
	int result = -1;
	char *utf8_path = util_toUTF8(path);

	if (utf8_path != NULL) {
		result = pmemblk_checkU(utf8_path, bsize);
		/* the converted copy is only needed for the call above */
		util_free_UTF8(utf8_path);
	}

	return result;
}
#endif