#include "db_config.h"
#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/db_join.h"
#include "dbinc/btree.h"
#include "dbinc/lock.h"
/*
 * Forward declarations for the file-local join cursor routines defined
 * below.
 */
static int __db_join_close_pp __P((DBC *));
static int __db_join_cmp __P((const void *, const void *));
static int __db_join_del __P((DBC *, u_int32_t));
static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
static int __db_join_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
static int __db_join_primget __P((DB *, DB_THREAD_INFO *,
DB_TXN *, DB_LOCKER *, DBT *, DBT *, u_int32_t));
static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
/*
 * SORTED_SET --
 *	Non-zero when the database behind the n'th entry of the join cursor's
 *	j_curslist has a duplicate comparison function (dup_compare)
 *	configured.
 */
#define SORTED_SET(jc, n) ((jc)->j_curslist[(n)]->dbp->dup_compare != NULL)
int
__db_join(primary, curslist, dbcp, flags)
DB *primary;
DBC **curslist, **dbcp;
u_int32_t flags;
{
DBC *dbc;
ENV *env;
JOIN_CURSOR *jc;
size_t ncurs, nslots;
u_int32_t i;
int ret;
env = primary->env;
dbc = NULL;
jc = NULL;
if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
goto err;
if ((ret = __os_calloc(env, 1, sizeof(JOIN_CURSOR), &jc)) != 0)
goto err;
if ((ret = __os_malloc(env, 256, &jc->j_key.data)) != 0)
goto err;
jc->j_key.ulen = 256;
F_SET(&jc->j_key, DB_DBT_USERMEM);
F_SET(&jc->j_rdata, DB_DBT_REALLOC);
for (jc->j_curslist = curslist;
*jc->j_curslist != NULL; jc->j_curslist++)
;
ncurs = (size_t)(jc->j_curslist - curslist);
nslots = ncurs + 1;
jc->j_curslist = NULL;
jc->j_workcurs = NULL;
jc->j_fdupcurs = NULL;
jc->j_exhausted = NULL;
if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
&jc->j_curslist)) != 0)
goto err;
if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
&jc->j_workcurs)) != 0)
goto err;
if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
&jc->j_fdupcurs)) != 0)
goto err;
if ((ret = __os_calloc(env, nslots, sizeof(u_int8_t),
&jc->j_exhausted)) != 0)
goto err;
for (i = 0; curslist[i] != NULL; i++) {
jc->j_curslist[i] = curslist[i];
jc->j_workcurs[i] = NULL;
jc->j_fdupcurs[i] = NULL;
jc->j_exhausted[i] = 0;
}
jc->j_ncurs = (u_int32_t)ncurs;
if (!LF_ISSET(DB_JOIN_NOSORT))
qsort(jc->j_curslist, ncurs, sizeof(DBC *), __db_join_cmp);
if ((ret =
__dbc_dup(jc->j_curslist[0], jc->j_workcurs, DB_POSITION)) != 0)
goto err;
dbc->close = dbc->c_close = __db_join_close_pp;
dbc->del = dbc->c_del = __db_join_del;
dbc->get = dbc->c_get = __db_join_get_pp;
dbc->put = dbc->c_put = __db_join_put;
dbc->internal = (DBC_INTERNAL *)jc;
dbc->dbp = primary;
jc->j_primary = primary;
dbc->txn = curslist[0]->txn;
*dbcp = dbc;
MUTEX_LOCK(env, primary->mutex);
TAILQ_INSERT_TAIL(&primary->join_queue, dbc, links);
MUTEX_UNLOCK(env, primary->mutex);
return (0);
err: if (jc != NULL) {
if (jc->j_curslist != NULL)
__os_free(env, jc->j_curslist);
if (jc->j_workcurs != NULL) {
if (jc->j_workcurs[0] != NULL)
(void)__dbc_close(jc->j_workcurs[0]);
__os_free(env, jc->j_workcurs);
}
if (jc->j_fdupcurs != NULL)
__os_free(env, jc->j_fdupcurs);
if (jc->j_exhausted != NULL)
__os_free(env, jc->j_exhausted);
__os_free(env, jc);
}
if (dbc != NULL)
__os_free(env, dbc);
return (ret);
}
/*
 * __db_join_close_pp --
 *	Close method installed on join cursors: wraps __db_join_close with
 *	the environment enter/leave calls and, when the environment is
 *	replicated, the replication enter/exit calls.
 *
 *	Returns the __db_rep_enter error if that fails (the cursor is then
 *	left untouched), otherwise the result of __db_join_close, or the
 *	__env_db_rep_exit error when the close itself succeeded.
 */
static int
__db_join_close_pp(dbc)
	DBC *dbc;
{
	DB *dbp;
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret;

	dbp = dbc->dbp;
	env = dbp->env;

	ENV_ENTER(env, ip);

	/* Replication bookkeeping is only needed in replicated environments. */
	handle_check = IS_ENV_REPLICATED(env);
	ret = 0;
	if (handle_check)
		ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(dbc->txn));

	if (ret == 0) {
		ret = __db_join_close(dbc);

		if (handle_check) {
			t_ret = __env_db_rep_exit(env);
			/* Keep the first error seen. */
			if (ret == 0 && t_ret != 0)
				ret = t_ret;
		}
	}

	ENV_LEAVE(env, ip);
	return (ret);
}
/*
 * __db_join_put --
 *	Put method installed on join cursors.  It performs no work and
 *	always returns EINVAL; the arguments are only touched to keep the
 *	compiler quiet about unused parameters.
 */
static int
__db_join_put(dbc, key, data, flags)
	DBC *dbc;
	DBT *key;
	DBT *data;
	u_int32_t flags;
{
	COMPQUIET(flags, 0);
	COMPQUIET(data, NULL);
	COMPQUIET(key, NULL);
	COMPQUIET(dbc, NULL);

	return (EINVAL);
}
/*
 * __db_join_del --
 *	Delete method installed on join cursors.  It performs no work and
 *	always returns EINVAL; the arguments are only touched to keep the
 *	compiler quiet about unused parameters.
 */
static int
__db_join_del(dbc, flags)
	DBC *dbc;
	u_int32_t flags;
{
	COMPQUIET(flags, 0);
	COMPQUIET(dbc, NULL);

	return (EINVAL);
}
/*
 * __db_join_get_pp --
 *	Get method installed on join cursors: validates the arguments, then
 *	wraps __db_join_get with the environment enter/leave calls and, when
 *	the environment is replicated, the replication enter/exit calls.
 *
 *	Argument checks, each returning before the environment is entered:
 *	- DB_READ_COMMITTED, DB_READ_UNCOMMITTED and DB_RMW are rejected via
 *	  __db_fnl unless locking is on;
 *	- with those modifiers masked off, flags must be 0 or DB_JOIN_ITEM,
 *	  otherwise __db_ferr is returned;
 *	- DB_DBT_PARTIAL on the key yields EINVAL.
 *
 *	__db_join_get receives the caller's original, unmasked flags.
 */
static int
__db_join_get_pp(dbc, key, data, flags)
	DBC *dbc;
	DBT *key, *data;
	u_int32_t flags;
{
	DB *dbp;
	DB_THREAD_INFO *ip;
	ENV *env;
	u_int32_t handle_check, save_flags;
	int ret, t_ret;

	dbp = dbc->dbp;
	env = dbp->env;

	/* Remember what the caller passed; "flags" is masked for checking. */
	save_flags = flags;

	if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
		if (!LOCKING_ON(env))
			return (__db_fnl(env, "DBC->get"));
		LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
	}

	/* Only the plain get and DB_JOIN_ITEM operations are accepted. */
	if (flags != 0 && flags != DB_JOIN_ITEM)
		return (__db_ferr(env, "DBC->get", 0));

	if (F_ISSET(key, DB_DBT_PARTIAL)) {
		__db_errx(env, DB_STR("0516",
		    "DB_DBT_PARTIAL may not be set on key during join_get"));
		return (EINVAL);
	}

	ENV_ENTER(env, ip);

	handle_check = IS_ENV_REPLICATED(env);
	ret = 0;
	if (handle_check)
		ret = __db_rep_enter(dbp, 1, 0, IS_REAL_TXN(dbc->txn));

	if (ret == 0) {
		/* Hand the unmasked flags to the worker. */
		ret = __db_join_get(dbc, key, data, save_flags);

		if (handle_check) {
			t_ret = __env_db_rep_exit(env);
			/* Keep the first error seen. */
			if (ret == 0 && t_ret != 0)
				ret = t_ret;
		}
	}

	ENV_LEAVE(env, ip);
	__dbt_userfree(env, key, NULL, NULL);
	return (ret);
}
static int
__db_join_get(dbc, key_arg, data_arg, flags)
DBC *dbc;
DBT *key_arg, *data_arg;
u_int32_t flags;
{
DB *dbp;
DBC *cp;
DBT *key_n, key_n_mem;
ENV *env;
JOIN_CURSOR *jc;
int db_manage_data, ret;
u_int32_t i, j, operation, opmods;
dbp = dbc->dbp;
env = dbp->env;
jc = (JOIN_CURSOR *)dbc->internal;
operation = LF_ISSET(DB_OPFLAGS_MASK);
opmods = LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
if (F_ISSET(key_arg,
DB_DBT_MALLOC | DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
key_n = &key_n_mem;
memset(key_n, 0, sizeof(DBT));
} else {
key_n = key_arg;
}
if (F_ISSET(key_arg, DB_DBT_USERCOPY))
key_arg->data = NULL;
if (F_ISSET(jc, JOIN_RETRY))
goto samekey;
F_CLR(jc, JOIN_RETRY);
retry: ret = __dbc_get(jc->j_workcurs[0], &jc->j_key, key_n,
opmods | (jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT));
if (ret == DB_BUFFER_SMALL) {
jc->j_key.ulen <<= 1;
if ((ret = __os_realloc(env,
jc->j_key.ulen, &jc->j_key.data)) != 0)
goto mem_err;
goto retry;
}
if (ret != 0)
goto err;
for (i = 1; i < jc->j_ncurs; i++) {
if (jc->j_fdupcurs[i] != NULL &&
(ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
goto err;
jc->j_fdupcurs[i] = NULL;
}
if (jc->j_curslist[1] == NULL)
jc->j_exhausted[0] = 1;
else
jc->j_exhausted[0] = 0;
for (i = 1; i < jc->j_ncurs; i++) {
DB_ASSERT(env, jc->j_curslist[i] != NULL);
if (jc->j_workcurs[i] == NULL)
if ((ret = __dbc_dup(jc->j_curslist[i],
&jc->j_workcurs[i], DB_POSITION)) != 0)
goto err;
retry2: cp = jc->j_workcurs[i];
if ((ret = __db_join_getnext(cp, &jc->j_key, key_n,
jc->j_exhausted[i], opmods)) == DB_NOTFOUND) {
--i;
jc->j_exhausted[i] = 1;
if (i == 0) {
for (j = 1; jc->j_workcurs[j] != NULL; j++) {
if ((ret = __dbc_close(
jc->j_workcurs[j])) != 0)
goto err;
if (!SORTED_SET(jc, 0) ||
!SORTED_SET(jc, j) ||
jc->j_fdupcurs[j] == NULL)
jc->j_workcurs[j] = NULL;
else
if ((__dbc_dup(
jc->j_fdupcurs[j],
&jc->j_workcurs[j],
DB_POSITION)) != 0)
goto err;
jc->j_exhausted[j] = 0;
}
goto retry;
}
for (j = i + 1;
jc->j_workcurs[j] != NULL;
j++) {
if ((ret =
__dbc_close(jc->j_workcurs[j])) != 0)
goto err;
jc->j_exhausted[j] = 0;
if (jc->j_fdupcurs[j] == NULL)
jc->j_workcurs[j] = NULL;
else if ((ret = __dbc_dup(jc->j_fdupcurs[j],
&jc->j_workcurs[j], DB_POSITION)) != 0)
goto err;
}
goto retry2;
}
if (ret == DB_BUFFER_SMALL) {
jc->j_key.ulen <<= 1;
if ((ret = __os_realloc(env, jc->j_key.ulen,
&jc->j_key.data)) != 0) {
mem_err: __db_errx(env, DB_STR_A("0517",
"Allocation failed for join key, len = %lu",
"%lu"), (u_long)jc->j_key.ulen);
goto err;
}
goto retry2;
}
if (ret != 0)
goto err;
if (i + 1 != jc->j_ncurs)
jc->j_exhausted[i] = 0;
else
jc->j_exhausted[i] = 1;
if (SORTED_SET(jc, i) && jc->j_fdupcurs[i] == NULL && (ret =
__dbc_dup(cp, &jc->j_fdupcurs[i], DB_POSITION)) != 0)
goto err;
}
err: if (ret != 0)
return (ret);
if (0) {
samekey:
if ((ret = __dbc_get(jc->j_workcurs[0],
&jc->j_key, key_n, DB_CURRENT | opmods)) != 0)
return (ret);
F_CLR(jc, JOIN_RETRY);
}
DB_ASSERT(env, F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
DB_DBT_USERCOPY) || key_n == key_arg);
if ((F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
DB_DBT_USERCOPY)) &&
(ret = __db_retcopy(env,
key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) {
F_SET(jc, JOIN_RETRY);
return (ret);
}
if (operation == DB_JOIN_ITEM)
return (0);
if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC |
DB_DBT_USERMEM | DB_DBT_USERCOPY))
db_manage_data = 1;
else
db_manage_data = 0;
if ((ret = __db_join_primget(jc->j_primary, dbc->thread_info,
jc->j_curslist[0]->txn, jc->j_curslist[0]->locker, key_n,
db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) {
if (ret == DB_NOTFOUND) {
if (LF_ISSET(DB_READ_UNCOMMITTED) ||
(jc->j_curslist[0]->txn != NULL && F_ISSET(
jc->j_curslist[0]->txn, TXN_READ_UNCOMMITTED)))
goto retry;
ret = __db_secondary_corrupt(jc->j_primary);
} else
F_SET(jc, JOIN_RETRY);
}
if (db_manage_data && ret == 0) {
data_arg->data = jc->j_rdata.data;
data_arg->size = jc->j_rdata.size;
}
return (ret);
}
/*
 * __db_join_close --
 *	Tear down a join cursor: unlink it from its database's join_queue
 *	(under the database mutex), close every open working and
 *	first-duplicate cursor, then free the join's arrays, key buffer,
 *	returned-data buffer, the JOIN_CURSOR and the DBC itself.
 *
 *	A failed cursor close does not stop the teardown; the return value
 *	is the error from the last close that failed, or 0.
 */
int
__db_join_close(dbc)
	DBC *dbc;
{
	DB *dbp;
	DB_THREAD_INFO *ip;
	ENV *env;
	JOIN_CURSOR *jc;
	int ret, t_ret;
	u_int32_t i;

	jc = (JOIN_CURSOR *)dbc->internal;
	dbp = dbc->dbp;
	env = dbp->env;
	ret = t_ret = 0;

	/* Unlink first, before anything below has a chance to fail. */
	MUTEX_LOCK(env, dbp->mutex);
	TAILQ_REMOVE(&dbp->join_queue, dbc, links);
	MUTEX_UNLOCK(env, dbp->mutex);

	ENV_ENTER(env, ip);
	for (i = 0; i < jc->j_ncurs; i++) {
		if (jc->j_workcurs[i] != NULL) {
			t_ret = __dbc_close(jc->j_workcurs[i]);
			if (t_ret != 0)
				ret = t_ret;
		}
		if (jc->j_fdupcurs[i] != NULL) {
			t_ret = __dbc_close(jc->j_fdupcurs[i]);
			if (t_ret != 0)
				ret = t_ret;
		}
	}
	ENV_LEAVE(env, ip);

	/* Per-cursor arrays. */
	__os_free(env, jc->j_exhausted);
	__os_free(env, jc->j_curslist);
	__os_free(env, jc->j_workcurs);
	__os_free(env, jc->j_fdupcurs);

	/* Key buffer, and the data buffer if one was ever allocated. */
	__os_free(env, jc->j_key.data);
	if (jc->j_rdata.data != NULL)
		__os_ufree(env, jc->j_rdata.data);

	/* Finally the join state and the cursor handle. */
	__os_free(env, jc);
	__os_free(env, dbc);

	return (ret);
}
/*
 * __db_join_getnext --
 *	Position "dbc" on the key/data pair (key, data), looking at the
 *	cursor's current item first when "exhausted" is 0 and searching
 *	onward with DB_GET_BOTHC otherwise.
 *
 *	dbc	   cursor to search.
 *	key, data  the pair being looked for; on a match at the current
 *		   position the stored data value is copied back into data.
 *	exhausted  0: test the current item before searching onward;
 *		   1: search onward immediately.  Any other value is EINVAL.
 *	opmods	   extra flags OR'ed into each __dbc_get call.
 *
 *	Returns 0 on a match, otherwise the error from the underlying
 *	__dbc_get or __db_retcopy call.
 */
static int
__db_join_getnext(dbc, key, data, exhausted, opmods)
	DBC *dbc;
	DBT *key, *data;
	u_int32_t exhausted, opmods;
{
	int ret, cmp;
	DB *dbp;
	DBT ldata;
	int (*func) __P((DB *, const DBT *, const DBT *));

	dbp = dbc->dbp;

	/* Compare with the database's dup_compare, else the default. */
	func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;

	switch (exhausted) {
	case 0:
		/* Read the current item into a separately malloc'ed DBT. */
		memset(&ldata, 0, sizeof(DBT));
		F_SET(&ldata, DB_DBT_MALLOC);
		if ((ret = __dbc_get(dbc,
		    key, &ldata, opmods | DB_CURRENT)) != 0)
			break;
		cmp = func(dbp, data, &ldata);
		if (cmp == 0) {
			/*
			 * Match: copy the stored value into data.  The
			 * temporary buffer is released whether or not the
			 * copy succeeds, so a failed copy cannot leak it.
			 */
			ret = __db_retcopy(dbp->env, data, ldata.data,
			    ldata.size, &data->data, &data->size);
			__os_ufree(dbp->env, ldata.data);
			return (ret);
		}

		/* No match here; discard the buffer and search onward. */
		__os_ufree(dbp->env, ldata.data);
		/* FALLTHROUGH */
	case 1:
		ret = __dbc_get(dbc, key, data, opmods | DB_GET_BOTHC);
		break;
	default:
		ret = EINVAL;
		break;
	}

	return (ret);
}
/*
 * __db_join_cmp --
 *	qsort comparison function for arrays of DBC pointers: orders cursors
 *	by ascending __dbc_count result.
 *
 *	Returns a negative, zero or positive value as the first cursor's
 *	count is less than, equal to or greater than the second's.  If
 *	either count cannot be obtained the two cursors compare equal.
 */
static int
__db_join_cmp(a, b)
	const void *a, *b;
{
	DBC *dbca, *dbcb;
	db_recno_t counta, countb;

	dbca = *((DBC * const *)a);
	dbcb = *((DBC * const *)b);

	if (__dbc_count(dbca, &counta) != 0 ||
	    __dbc_count(dbcb, &countb) != 0)
		return (0);

	/*
	 * Compare explicitly rather than returning a subtraction: a
	 * difference squeezed into an int can change sign for large counts.
	 */
	if (counta < countb)
		return (-1);
	if (counta > countb)
		return (1);
	return (0);
}
/*
 * __db_join_primget --
 *	Look up "key" in the database "dbp" with a freshly created cursor
 *	and return the matching data in "data".
 *
 *	dbp, ip, txn, locker  passed through to __db_cursor_int.
 *	key, data	      passed to __dbc_get with DB_SET.
 *	flags		      may carry DB_RMW, DB_READ_UNCOMMITTED and
 *			      DB_READ_COMMITTED; nothing else is expected
 *			      (asserted).
 *
 *	DB_RMW is forwarded to the get itself; the read-isolation settings
 *	(from the flags or from the transaction) are applied as cursor
 *	flags.  The cursor is always closed before returning.
 *
 *	Returns the cursor creation error, else the get's result, else the
 *	close's result.
 */
static int
__db_join_primget(dbp, ip, txn, locker, key, data, flags)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	DB_LOCKER *locker;
	DBT *key, *data;
	u_int32_t flags;
{
	DBC *dbc;
	u_int32_t rmw;
	int ret, t_ret;

	ret = __db_cursor_int(dbp, ip,
	    txn, dbp->type, PGNO_INVALID, 0, locker, &dbc);
	if (ret != 0)
		return (ret);

	/* Pull DB_RMW out before the modifier bits are cleared below. */
	rmw = LF_ISSET(DB_RMW);

	/* Isolation may be requested per call or inherited from the txn. */
	if (LF_ISSET(DB_READ_UNCOMMITTED) ||
	    (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
		F_SET(dbc, DBC_READ_UNCOMMITTED);
	if (LF_ISSET(DB_READ_COMMITTED) ||
	    (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
		F_SET(dbc, DBC_READ_COMMITTED);

	/* With the known modifiers removed, nothing should remain. */
	LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
	DB_ASSERT(dbp->env, flags == 0);

	F_SET(dbc, DBC_TRANSIENT);
	SET_RET_MEM(dbc, dbp);

	ret = __dbc_get(dbc, key, data, DB_SET | rmw);

	/* Always close the cursor; report the get's error in preference. */
	t_ret = __dbc_close(dbc);
	if (ret == 0 && t_ret != 0)
		ret = t_ret;

	return (ret);
}
/*
 * __db_secondary_corrupt --
 *	Report DB_SECONDARY_BAD for "dbp" through __db_err, identifying the
 *	database as "fname", or "fname/dname" when a dname is set ("unnamed"
 *	stands in for a missing fname), and return DB_SECONDARY_BAD.
 */
int
__db_secondary_corrupt(dbp)
	DB *dbp;
{
	const char *file, *sep, *sub;

	file = dbp->fname;
	if (file == NULL)
		file = "unnamed";

	/* The separator and sub-database name appear only together. */
	if (dbp->dname == NULL) {
		sep = "";
		sub = "";
	} else {
		sep = "/";
		sub = dbp->dname;
	}

	__db_err(dbp->env, DB_SECONDARY_BAD, "%s%s%s", file, sep, sub);
	return (DB_SECONDARY_BAD);
}