#include "FAConfig.h"
#include "FAUtils.h"
#include "FASecurity.h"
class SortBytes {
protected:
static int fReverse;
static int fNumeric;
static int fUnique;
private:
typedef struct MERGEDATA
{
int iFile; unsigned char *pwsz; FILE *pFile; } MERGEDATA;
typedef struct SORTDATA
{
char pszTempDir[PATH_MAX]; char * pszTempFileName; unsigned char *pwszCoreLoad; int cwchCoreLoadMax; int cwchCoreLoadMac; int cwchCoreLoadLim; unsigned char **ppwszCorePointers; int cCorePointersMax; int cCorePointersMac;
int cwchStringMac;
MERGEDATA rgMergeData[FOPEN_MAX]; int cOpenFilesMax; int cFilesMac; int cFilesMic; } SORTDATA;
public:
static void Sort(FILE* pInput, int cMegsCoreLoad, const char* pszOutputFile);
static void SetFReverse(int fR);
static int GetFReverse();
static void SetFNumeric(int fN);
static int GetFNumeric();
static void SetFUnique(int fU);
static int GetFUnique();
protected:
static char* MakeSortName(int iFile, SORTDATA *pSortData);
static int KeyCmp(const unsigned char *pwsz1, const unsigned char *pwsz2) {
int cmpValue;
int strlen1;
int strlen2;
int minlen;
if (fNumeric) {
cmpValue = strtol((const char*)pwsz1,NULL,10) - strtol((const char*)pwsz2,NULL,10);
} else {
strlen1 = (int) strlen((const char*)pwsz1);
strlen2 = (int) strlen((const char*)pwsz2);
if (strlen2 < strlen1)
{
minlen = strlen2;
}
else {
minlen = strlen1;
}
cmpValue = memcmp(pwsz1,pwsz2,minlen);
if (0 == cmpValue) {
if (strlen2 < strlen1)
{
cmpValue = 1;
}
else if (strlen1 < strlen2)
{
cmpValue = -1;
}
else
{
}
}
}
if (fReverse) {
cmpValue = -cmpValue;
}
return cmpValue;
}
static int __cdecl MergeCmp(const MERGEDATA *pMergeData1, const MERGEDATA *pMergeData2)
{
return KeyCmp(pMergeData1->pwsz, pMergeData2->pwsz);
}
static int __cdecl SortCmp(const unsigned char **ppwsz1, const unsigned char **ppwsz2)
{
return KeyCmp(*ppwsz1, *ppwsz2);
}
static void SortDump(const char *pszError, SORTDATA *pSortData);
static int SortCoreLoad(FILE *pInput, SORTDATA *pSortData);
static int SortPhase(FILE *pInput, SORTDATA *pSortData);
static int MergeNextSet(const char *pszOutputFileName, SORTDATA *pSortData);
static int MergePhase(const char *pOutputFileName, SORTDATA *pSortData);
static void CountMaxOpenFiles(SORTDATA *pSortData);
static unsigned char* GetLineInUChar(
__out_ecount(cchMaxLen) unsigned char* pStr0,
int cchMaxLen,
FILE *pFile);
static void PutLineInUChar(const unsigned char* pString, FILE *pFile);
};
int SortBytes::fReverse = 0;
int SortBytes::fNumeric = 0;
int SortBytes::fUnique = 0;
void SortBytes::SetFReverse(int fR)
{
fReverse = fR;
}
int SortBytes::GetFReverse()
{
return fReverse;
}
void SortBytes::SetFNumeric(int fN)
{
fNumeric = fN;
}
int SortBytes::GetFNumeric()
{
return fNumeric;
}
void SortBytes::SetFUnique(int fU)
{
fUnique = fU;
}
int SortBytes::GetFUnique()
{
return fUnique;
}
char* SortBytes::MakeSortName(int iFile, SORTDATA *pSortData)
{
#if _MSC_VER < 1400
sprintf (pSortData->pszTempFileName, "%s\\sort%04X.%03X", pSortData->pszTempDir, _getpid(), iFile);
#else
const int TmpBuffSize = (int) strlen (pSortData->pszTempDir) + 14; sprintf_s (pSortData->pszTempFileName, TmpBuffSize, "%s\\sort%04X.%03X", pSortData->pszTempDir, _getpid(), iFile);
#endif
return pSortData->pszTempFileName;
}
void SortBytes::SortDump(const char *pszError, SORTDATA *pSortData)
{
int i;
fprintf(stderr,"%s: cCorePointersMac/Max = %d/%d\n",pszError, pSortData->cCorePointersMac, pSortData->cCorePointersMax);
for (i = 0; i < pSortData->cCorePointersMac; ++i) {
fprintf(stderr,"\t%3d: '%s'\n",i,pSortData->ppwszCorePointers[i]);
}
}
int SortBytes::SortCoreLoad(FILE *pInput, SORTDATA *pSortData)
{
FILE *pOutputFile;
int i, len;
if (pSortData->cwchCoreLoadMac > pSortData->cwchCoreLoadLim) {
pSortData->cwchCoreLoadMac -= pSortData->cwchCoreLoadLim;
memmove(pSortData->pwszCoreLoad,
pSortData->pwszCoreLoad+pSortData->cwchCoreLoadLim,
pSortData->cwchCoreLoadMac*sizeof(unsigned char));
pSortData->cwchCoreLoadLim = 0;
}
pSortData->cwchCoreLoadMac -= pSortData->cwchCoreLoadLim;
pSortData->cwchCoreLoadLim = 0;
pSortData->ppwszCorePointers[0] = pSortData->pwszCoreLoad;
pSortData->cCorePointersMac = 0;
while (pSortData->cwchCoreLoadMax - pSortData->cwchCoreLoadMac > 2 &&
GetLineInUChar(pSortData->pwszCoreLoad + pSortData->cwchCoreLoadMac,
pSortData->cwchCoreLoadMax - pSortData->cwchCoreLoadMac,
pInput)) {
if (pSortData->cCorePointersMac == pSortData->cCorePointersMax) {
pSortData->cCorePointersMax *= 2;
void * const ptr = realloc (pSortData->ppwszCorePointers, \
pSortData->cCorePointersMax*sizeof(unsigned char *));
if (NULL == ptr) {
fprintf(stderr,"fa_sortbytes: out of memory\n");
exit(-1);
}
pSortData->ppwszCorePointers = (unsigned char**) ptr;
}
if (pSortData->cCorePointersMac) {
pSortData->ppwszCorePointers[pSortData->cCorePointersMac] =
pSortData->pwszCoreLoad + pSortData->cwchCoreLoadMac;
}
++pSortData->cCorePointersMac;
while (pSortData->cwchCoreLoadMax > pSortData->cwchCoreLoadMac &&
pSortData->pwszCoreLoad[pSortData->cwchCoreLoadMac]) {
++pSortData->cwchCoreLoadMac;
}
if (pSortData->pwszCoreLoad[pSortData->cwchCoreLoadMac-1] != L'\n') {
--pSortData->cCorePointersMac;
break;
};
pSortData->pwszCoreLoad[pSortData->cwchCoreLoadMac-1] = L'\x00';
pSortData->cwchCoreLoadLim = pSortData->cwchCoreLoadMac;
}
if (!pSortData->cwchCoreLoadLim) {
return false;
}
qsort(pSortData->ppwszCorePointers, pSortData->cCorePointersMac, sizeof(unsigned char *), reinterpret_cast<int(__cdecl *)(const void *, const void *)>(SortBytes::SortCmp));
int res = fopen_s (&pOutputFile, MakeSortName(pSortData->cFilesMac,pSortData),"wb");
if (NULL == pOutputFile || 0 != res) {
fprintf(stderr,"Error opening coreload file %d\n",pSortData->cFilesMac);
exit(-1);
}
pSortData->cFilesMac++;
for (i = 0; i < pSortData->cCorePointersMac; ++i) {
PutLineInUChar(pSortData->ppwszCorePointers[i],pOutputFile);
fwrite("\n", 1, sizeof(char), pOutputFile);
len = (int) strlen((char*)(pSortData->ppwszCorePointers[i]));
if (len > pSortData->cwchStringMac) {
pSortData->cwchStringMac = len;
}
}
fclose(pOutputFile);
return true;
}
int SortBytes::SortPhase(FILE *pInput, SORTDATA *pSortData)
{
pSortData->pwszCoreLoad = (unsigned char*)malloc(pSortData->cwchCoreLoadMax*sizeof(unsigned char));
pSortData->ppwszCorePointers = (unsigned char**)malloc(sizeof(unsigned char *));
pSortData->cCorePointersMax = 1;
if (!pSortData->pwszCoreLoad || !pSortData->ppwszCorePointers) {
fprintf(stderr,"Out of memory in sort initialization\n");
exit(-1);
}
while (SortCoreLoad(pInput,pSortData))
;
free(pSortData->pwszCoreLoad);
free(pSortData->ppwszCorePointers);
return (true);
}
int SortBytes::MergeNextSet(const char *pszOutputFileName, SORTDATA *pSortData)
{
int i, cFiles, lastPass, cFiles0, CmpEntries, fSkip;
MERGEDATA *pMergeData, mergeDataTmp;
unsigned char *pwszCoreLoad;
FILE *pOutputFile;
cFiles = cFiles0 = (pSortData->cOpenFilesMax <= pSortData->cFilesMac - pSortData->cFilesMic) ? \
pSortData->cOpenFilesMax : \
pSortData->cFilesMac - pSortData->cFilesMic ;
lastPass = false;
if (cFiles + pSortData->cFilesMic == pSortData->cFilesMac) {
lastPass = true;
}
pMergeData = pSortData->rgMergeData;
pwszCoreLoad = pSortData->pwszCoreLoad;
for (i = 0; i < cFiles; ++i) {
pMergeData->iFile = pSortData->cFilesMic + i;
pMergeData->pwsz = pwszCoreLoad;
int res = fopen_s (&(pMergeData->pFile), MakeSortName(pMergeData->iFile,pSortData), "rb");
if (NULL == pMergeData->pFile || 0 != res) {
perror("Cannot open a temporary file.");
exit(-1);
}
GetLineInUChar(pMergeData->pwsz,pSortData->cwchStringMac+1,pMergeData->pFile);
++pMergeData;
pwszCoreLoad += pSortData->cwchStringMac+1;
}
int res = 0;
if (lastPass) {
if (pszOutputFileName) {
res = fopen_s (&pOutputFile, pszOutputFileName, "wb");
} else {
pOutputFile = stdout;
}
} else {
res = fopen_s (&pOutputFile, MakeSortName(pSortData->cFilesMac,pSortData), "wb");
}
if (NULL == pOutputFile || 0 != res) {
perror("Can't open merge output file");
exit(-1);
}
qsort(pSortData->rgMergeData,cFiles,sizeof(MERGEDATA),reinterpret_cast<int(__cdecl *)(const void *, const void *)>(SortBytes::MergeCmp));
pMergeData = pSortData->rgMergeData;
fSkip = false;
while (cFiles) {
if (!fSkip)
{
PutLineInUChar(pMergeData->pwsz,pOutputFile);
}
fSkip = false;
if (GetLineInUChar(pMergeData->pwsz,pSortData->cwchStringMac+1,pMergeData->pFile)) {
for (i = 1; i < cFiles; ++i)
{
CmpEntries = MergeCmp(pMergeData,pMergeData+i);
if (0 > CmpEntries) break;
else if (0 == CmpEntries)
{
fSkip = fUnique;
break;
}
}
if (i > 1) {
mergeDataTmp = *pMergeData;
memcpy(pMergeData,pMergeData+1,(i-1)*sizeof(MERGEDATA));
pMergeData[i-1] = mergeDataTmp;
}
} else {
--cFiles;
res = fclose(pMergeData->pFile);
res |= _unlink(MakeSortName(pMergeData->iFile,pSortData));
DebugLogAssert (0 == res);
memcpy(pMergeData,pMergeData+1,cFiles*sizeof(MERGEDATA));
}
}
if (pOutputFile != stdout) {
fclose(pOutputFile);
}
if (lastPass) {
return (false);
}
pSortData->cFilesMic += cFiles0;
++pSortData->cFilesMac;
return (true);
}
int SortBytes::MergePhase(const char *pOutputFileName, SORTDATA *pSortData)
{
pSortData->pwszCoreLoad = (unsigned char*)malloc((pSortData->cwchStringMac+1)*(pSortData->cOpenFilesMax)*sizeof(unsigned char));
if (!pSortData->pwszCoreLoad) {
fprintf(stderr,"Out of memory in merge initialization\n");
exit(-1);
}
while (MergeNextSet(pOutputFileName,pSortData))
;
free(pSortData->pwszCoreLoad);
return (true);
}
void SortBytes::CountMaxOpenFiles(SORTDATA *pSortData)
{
int i;
FILE *rgFile[FOPEN_MAX];
for (i = 0; i < FOPEN_MAX; ++i) {
int res = fopen_s (&(rgFile[i]), MakeSortName(i,pSortData), "w");
if (NULL == rgFile[i] || 0 != res) {
break;
}
}
if (i < 3) {
fprintf(stderr,"Can't open at least 3 files!\n");
exit(-1);
}
pSortData->cOpenFilesMax = i-1;
while (i--) {
int res = fclose(rgFile[i]);
res |= _unlink(MakeSortName(i,pSortData));
DebugLogAssert (0 == res);
}
}
unsigned char* SortBytes::GetLineInUChar(
__out_ecount(cchMaxLen) unsigned char* pStr0,
int cchMaxLen,
FILE *pFile)
{
char ch;
char* pStr;
if (cchMaxLen <= 0)
return NULL;
pStr = (char*)pStr0;
cchMaxLen--; while (cchMaxLen > 0)
{
ch = (char)fgetc(pFile);
if (feof(pFile))
{
if (pStr == (char*)pStr0)
return NULL;
break;
}
switch (ch)
{
case '\r':
break;
default:
*pStr++ = ch;
cchMaxLen--;
break;
}
if (ch == '\n')
break;
}
*pStr = '\0';
return(pStr0);
}
void SortBytes::PutLineInUChar (const unsigned char* pString, FILE *pFile)
{
size_t len = strlen ((char*) pString);
bool fNeedNewLine = false;
if (len && pString [len - 1] == '\n') {
--len;
fNeedNewLine = true;
}
if (len && pString [len - 1] == '\r') {
--len;
fNeedNewLine = true;
}
fwrite (pString, len, sizeof (char), pFile);
if (fNeedNewLine) {
fwrite ("\n", 1, sizeof (char), pFile);
}
}
int
__cdecl
main(int argc, char** argv)
{
FILE *pInput;
int i;
int fError = 0;
const char *pszOutputFile;
int cMegsCoreLoad = 1; int fNumeric = 0;
int fReverse = 0;
int fUnique = 0;
::FAIOSetup ();
while (argc > 1) {
if (*argv[1] != '-' && *argv[1] != '/') {
break;
}
for (i = 1; argv[1][i] != '\0'; ++i) {
switch (argv[1][i]) {
case 'n': fNumeric = 1;
break;
case 'r': fReverse = 1;
break;
case 'u': fUnique = 1;
break;
case 'm': cMegsCoreLoad = 250;
break;
default:
fError = 1;
}
}
--argc;
++argv;
}
if (argc > 3) {
fError = 1;
}
if (fError) {
fprintf(stderr,"Usage: fa_sortbytes <-nrm> <input-file <output-file>>\n");
fprintf(stderr," -n sort in numeric order\n");
fprintf(stderr," -r sort in reverse order\n");
fprintf(stderr," -u remove duplicate entries - each resulting entry will be unique\n");
fprintf(stderr," (not implemented yet)\n");
fprintf(stderr," -m use ~512 MB of RAM during sort phase (default ~2)\n");
exit(-1);
}
pInput = stdin;
int res = 0;
if (argc > 1) {
res = fopen_s (&pInput, argv[1], "rb");
}
if (NULL == pInput || 0 != res) {
perror("Cannot open the input file.");
exit(-1);
}
pszOutputFile = argc > 2 ? argv[2] : NULL;
::FAInputIOSetup ();
SortBytes::SetFReverse(fReverse);
SortBytes::SetFNumeric(fNumeric);
SortBytes::SetFNumeric(fUnique);
SortBytes::Sort(pInput,cMegsCoreLoad,pszOutputFile);
exit (0);
}
void SortBytes::Sort(FILE* pInput, int cMegsCoreLoad, const char* pszOutputFile)
{
SORTDATA sortData;
int cchFileNames;
memset(&sortData,0,sizeof(sortData));
sortData.cwchCoreLoadMax = cMegsCoreLoad * 1024*1024;
sortData.pszTempDir [0] = '.';
sortData.pszTempDir [1] = 0;
cchFileNames = (int) strlen(sortData.pszTempDir) + 14;
sortData.pszTempFileName = (char*)malloc(cchFileNames*sizeof(char));
CountMaxOpenFiles(&sortData);
SortPhase(pInput, &sortData);
if (pInput != stdin) {
fclose(pInput);
}
MergePhase(pszOutputFile,&sortData);
}