#define LG_FREE_ALL LAGraph_Free ((void **) &W, NULL) ;
#include "LG_internal.h"
void LG_msort_1b_create_merge_tasks
(
int64_t *LG_RESTRICT L_task, int64_t *LG_RESTRICT L_len, int64_t *LG_RESTRICT R_task, int64_t *LG_RESTRICT R_len, int64_t *LG_RESTRICT S_task, const int t0, const int ntasks, const int64_t pS_start, const int64_t *LG_RESTRICT L_0, const int64_t pL_start,
const int64_t pL_end,
const int64_t *LG_RESTRICT R_0, const int64_t pR_start,
const int64_t pR_end
) ;
static int64_t LG_msort_1b_binary_search (
const int64_t *LG_RESTRICT Y_0, const int64_t pivot,
const int64_t *LG_RESTRICT X_0, const int64_t p_start,
const int64_t p_end
)
{
int64_t pleft = p_start ;
int64_t pright = p_end - 1 ;
while (pleft < pright)
{
int64_t pmiddle = (pleft + pright) >> 1 ;
bool less = LG_lt_1 (X_0, pmiddle,
Y_0, pivot) ;
pleft = less ? (pmiddle+1) : pleft ;
pright = less ? pright : pmiddle ;
}
ASSERT (pleft == pright || pleft == pright + 1) ;
bool found = (pleft == pright) && LG_eq_1 (X_0, pleft,
Y_0, pivot) ;
if (!found && (pleft == pright))
{
if (LG_lt_1 (X_0, pleft,
Y_0, pivot))
{
pleft++ ;
}
else
{
}
}
return (pleft) ;
}
void LG_msort_1b_create_merge_tasks
(
int64_t *LG_RESTRICT L_task, int64_t *LG_RESTRICT L_len, int64_t *LG_RESTRICT R_task, int64_t *LG_RESTRICT R_len, int64_t *LG_RESTRICT S_task, const int t0, const int ntasks, const int64_t pS_start, const int64_t *LG_RESTRICT L_0, const int64_t pL_start,
const int64_t pL_end,
const int64_t *LG_RESTRICT R_0, const int64_t pR_start,
const int64_t pR_end
)
{
int64_t nleft = pL_end - pL_start ; int64_t nright = pR_end - pR_start ; int64_t total_work = nleft + nright ; ASSERT (ntasks >= 1) ;
ASSERT (total_work > 0) ;
if (ntasks == 1)
{
L_task [t0] = pL_start ; L_len [t0] = nleft ;
R_task [t0] = pR_start ; R_len [t0] = nright ;
S_task [t0] = pS_start ;
}
else
{
int64_t pleft, pright ;
if (nleft >= nright)
{
pleft = (pL_end + pL_start) >> 1 ;
pright = LG_msort_1b_binary_search (
L_0, pleft,
R_0, pR_start, pR_end) ;
}
else
{
pright = (pR_end + pR_start) >> 1 ;
pleft = LG_msort_1b_binary_search (
R_0, pright,
L_0, pL_start, pL_end) ;
}
int64_t work0 = (pleft - pL_start) + (pright - pR_start) ;
int ntasks0 = (int) round ((double) ntasks *
(((double) work0) / ((double) total_work))) ;
ntasks0 = LAGRAPH_MAX (ntasks0, 1) ;
ntasks0 = LAGRAPH_MIN (ntasks0, ntasks-1) ;
int ntasks1 = ntasks - ntasks0 ;
LG_msort_1b_create_merge_tasks (
L_task, L_len, R_task, R_len, S_task, t0, ntasks0, pS_start,
L_0, pL_start, pleft,
R_0, pR_start, pright) ;
int t1 = t0 + ntasks0 ; int64_t pS_start1 = pS_start + work0 ; LG_msort_1b_create_merge_tasks (
L_task, L_len, R_task, R_len, S_task, t1, ntasks1, pS_start1,
L_0, pleft, pL_end,
R_0, pright, pR_end) ;
}
}
static void LG_msort_1b_merge
(
int64_t *LG_RESTRICT S_0, const int64_t *LG_RESTRICT Left_0, const int64_t nleft,
const int64_t *LG_RESTRICT Right_0, const int64_t nright
)
{
int64_t p, pleft, pright ;
for (p = 0, pleft = 0, pright = 0 ; pleft < nleft && pright < nright ; p++)
{
if (LG_lt_1 (Left_0, pleft,
Right_0, pright))
{
S_0 [p] = Left_0 [pleft] ;
pleft++ ;
}
else
{
S_0 [p] = Right_0 [pright] ;
pright++ ;
}
}
if (pleft < nleft)
{
int64_t nremaining = (nleft - pleft) ;
memcpy (S_0 + p, Left_0 + pleft, nremaining * sizeof (int64_t)) ;
}
else if (pright < nright)
{
int64_t nremaining = (nright - pright) ;
memcpy (S_0 + p, Right_0 + pright, nremaining * sizeof (int64_t)) ;
}
}
int LG_msort1
(
int64_t *A_0, const int64_t n,
char *msg
)
{
LG_CLEAR_MSG ;
int64_t *LG_RESTRICT W = NULL ;
LG_ASSERT (A_0 != NULL, GrB_NULL_POINTER) ;
int nthreads = LG_nthreads_outer * LG_nthreads_inner ; if (nthreads <= 1 || n <= LG_BASECASE)
{
LG_qsort_1a (A_0, n) ;
return (GrB_SUCCESS) ;
}
int k = (int) (2 + 2 * ceil (log2 ((double) nthreads) / 2)) ;
int ntasks = 1 << k ;
LG_TRY (LAGraph_Malloc ((void **) &W, n + 6*ntasks + 1, sizeof (int64_t), msg)) ;
int64_t *T = W ;
int64_t *LG_RESTRICT W_0 = T ; T += n ;
int64_t *LG_RESTRICT L_task = T ; T += ntasks ;
int64_t *LG_RESTRICT L_len = T ; T += ntasks ;
int64_t *LG_RESTRICT R_task = T ; T += ntasks ;
int64_t *LG_RESTRICT R_len = T ; T += ntasks ;
int64_t *LG_RESTRICT S_task = T ; T += ntasks ;
int64_t *LG_RESTRICT Slice = T ; T += (ntasks+1) ;
LG_eslice (Slice, n, ntasks) ;
int tid ;
#pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
for (tid = 0 ; tid < ntasks ; tid++)
{
int64_t leaf = Slice [tid] ;
int64_t leafsize = Slice [tid+1] - leaf ;
LG_qsort_1a (A_0 + leaf, leafsize) ;
}
int nt = 1 ;
for ( ; k >= 2 ; k -= 2)
{
for (tid = 0 ; tid < ntasks ; tid += 2*nt)
{
LG_msort_1b_create_merge_tasks (
L_task, L_len, R_task, R_len, S_task, tid, 2*nt, Slice [tid],
A_0, Slice [tid], Slice [tid+nt],
A_0, Slice [tid+nt], Slice [tid+2*nt]) ;
}
#pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
for (tid = 0 ; tid < ntasks ; tid++)
{
int64_t pL = L_task [tid], nL = L_len [tid] ;
int64_t pR = R_task [tid], nR = R_len [tid] ;
int64_t pS = S_task [tid] ;
LG_msort_1b_merge (
W_0 + pS,
A_0 + pL, nL,
A_0 + pR, nR) ;
}
nt = 2*nt ;
for (tid = 0 ; tid < ntasks ; tid += 2*nt)
{
LG_msort_1b_create_merge_tasks (
L_task, L_len, R_task, R_len, S_task, tid, 2*nt, Slice [tid],
W_0, Slice [tid], Slice [tid+nt],
W_0, Slice [tid+nt], Slice [tid+2*nt]) ;
}
#pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
for (tid = 0 ; tid < ntasks ; tid++)
{
int64_t pL = L_task [tid], nL = L_len [tid] ;
int64_t pR = R_task [tid], nR = R_len [tid] ;
int64_t pS = S_task [tid] ;
LG_msort_1b_merge (
A_0 + pS,
W_0 + pL, nL,
W_0 + pR, nR) ;
}
nt = 2*nt ;
}
LG_FREE_ALL ;
return (GrB_SUCCESS) ;
}