panthema / 2013 / parallel-string-sorting / parallel-string-sorting-0.6.5 / src / sinha-copy-burstsort / glue.cc (Download File)
/******************************************************************************
 * src/sinha-copy-burstsort/glue.cc
 *
 * Glue to attach Sinha's C-Burstsort code to parallel-string-sort framework.
 *
 ******************************************************************************
 * Copyright (C) 2012-2013 Timo Bingmann <tb@panthema.net>
 *
 * This program is free software: you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program.  If not, see <http://www.gnu.org/licenses/>.
 *****************************************************************************/

#include "../tools/contest.h"

#include <time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <float.h>
#include <limits.h>

extern const char* g_string_data;
extern size_t g_string_datasize;

namespace sinha_copy_burstsort {

#include "C-burstsort.c"
#include "CP-burstsort.c"
#include "CPL-burstsort.c"
#include "utils.c"
//#include "copy-burstsort.c"

void getsegs(string) {}
void getkeys(string) {}
void listinputs() {}

void sinha_C_burstsort_prepare(string *strings, size_t size)
{
    char CHARCOUNT[256] = { 0 };
    MAXKEYLEN = 0;

    for (size_t i = 0; i < size; ++i)
    {
        size_t j;
        for (j = 0; strings[i][j]; ++j)
        {
            CHARCOUNT[ strings[i][j] ] = 1;
        }
        if (j > MAXKEYLEN) MAXKEYLEN = j;
    }
    LOCHAR = 1; while (!CHARCOUNT[LOCHAR]) ++LOCHAR;
    HICHAR = 255; while (!CHARCOUNT[HICHAR]) --HICHAR;
}

void sinha_C_burstsort(string *strings, size_t size)
{
    /* set default values for command line args. */
    CACHESIZE = 1<<19;
    BINSIZE0 = 1<<9;		/* initial size of buffer in terminal node */
    while (BINSIZE0 < MAXKEYLEN) BINSIZE0 *= 2; /* must fix at least one key */

    SAMPLERATE = 0;		/* if >0, will sample 1/SAMPLERATE of data */
    FREEBURSTS = 0;		/* number of low threshold bursts to allow */

    NKEYS = size;
    NBYTES = g_string_datasize;
    LIMSIZE0 = BINSIZE0 - MAXKEYLEN;

    /* Zero counters for burst trie objects. */
    NODES = BINS = NULLBINS = 0;

    /* Run C-burstsort on the data that was loaded in getsegs() above. */

    /* Create root node of burst trie. */
    NODE1 *rt = GETNODE(NODE1); ++NODES;

    /* Insert a group of keys; seg[] and seglim[] are set up so that
       segments begin and end at key boundaries. */
    sadd((string)g_string_data, (string)g_string_data + g_string_datasize, rt);

    /* Discard any remaining free bursts; see below in straverse() for why. */
    FREEBURSTS = 0;

    /* Starting at the root node, traverse the trie and sort each bin. */
    straverse(rt);

    /* Write out string characters and string pointers to psstest */
    tb_ssavetrie(rt, strings, (string)g_string_data);

    /* Deallocate the root node. */
    skill(rt);
}

void sinha_fbC_burstsort(unsigned char **strings, size_t size)
{
    /* set default values for command line args. */
    CACHESIZE = 1<<19;
    BINSIZE0 = 1<<9;		/* initial size of buffer in terminal node */
    while (BINSIZE0 < MAXKEYLEN) BINSIZE0 *= 2; /* must fix at least one key */

    SAMPLERATE = 0;		/* if >0, will sample 1/SAMPLERATE of data */
    FREEBURSTS = 100;		/* number of low threshold bursts to allow */

    NKEYS = size;
    NBYTES = g_string_datasize;
    LIMSIZE0 = BINSIZE0 - MAXKEYLEN;

    /* Zero counters for burst trie objects. */
    NODES = BINS = NULLBINS = 0;

    /* Run C-burstsort on the data that was loaded in getsegs() above. */

    /* Create root node of burst trie. */
    NODE1 *rt = GETNODE(NODE1); ++NODES;

    /* Insert a group of keys; seg[] and seglim[] are set up so that
       segments begin and end at key boundaries. */
    sadd((string)g_string_data, (string)g_string_data + g_string_datasize, rt);

    /* Discard any remaining free bursts; see below in straverse() for why. */
    FREEBURSTS = 0;

    /* Starting at the root node, traverse the trie and sort each bin. */
    straverse(rt);

    /* Write out string characters and string pointers to psstest */
    tb_ssavetrie(rt, strings, (string)g_string_data);

    /* Deallocate the root node. */
    skill(rt);
}

void sinha_sC_burstsort(unsigned char **strings, size_t size)
{
    /* set default values for command line args. */
    CACHESIZE = 1<<19;
    BINSIZE0 = 1<<9;		/* initial size of buffer in terminal node */
    while (BINSIZE0 < MAXKEYLEN) BINSIZE0 *= 2; /* must fix at least one key */

    SAMPLERATE = 4000;		/* if >0, will sample 1/SAMPLERATE of data */
    FREEBURSTS = 0;		/* number of low threshold bursts to allow */

    NKEYS = size;
    NBYTES = g_string_datasize;
    NSEGS = 1;
    string mySEGMENTS = { (string)g_string_data };
    string mySEGLIMITS = { (string)g_string_data + g_string_datasize };

    LIMSIZE0 = BINSIZE0 - MAXKEYLEN;

    /* Zero counters for burst trie objects. */
    NODES = BINS = NULLBINS = 0;

    /* Run C-burstsort on the data that was loaded in getsegs() above. */

    /* Create root node of burst trie. */
    NODE1 *rt = GETNODE(NODE1); ++NODES;

    /* Sampling mode prebuilds a trie based on a random sample of keys. */
    if (SAMPLERATE)
    {
        /* Build trie using 1/SAMPLERATE of the keys and bursting any bin
           that fills at BINSIZE0 rather than the usual threshold. */
        ssample(&mySEGMENTS, &mySEGLIMITS, rt, NKEYS/SAMPLERATE);

        /* Clear data but retain node/bin structure of the "skeleton trie". */
        clear(rt);
    }

    /* Insert a group of keys; seg[] and seglim[] are set up so that
       segments begin and end at key boundaries. */
    sadd((string)g_string_data, (string)g_string_data + g_string_datasize, rt);

    /* Discard any remaining free bursts; see below in straverse() for why. */
    FREEBURSTS = 0;

    /* Starting at the root node, traverse the trie and sort each bin. */
    straverse(rt);

    /* Write out string characters and string pointers to psstest */
    tb_ssavetrie(rt, strings, (string)g_string_data);

    /* Deallocate the root node. */
    skill(rt);
}

CONTESTANT_REGISTER_PREPARE(sinha_C_burstsort_prepare, sinha_C_burstsort,
                            "sinha/C_burstsort", "Original C-burstsort");

CONTESTANT_REGISTER_PREPARE(sinha_C_burstsort_prepare, sinha_fbC_burstsort,
                            "sinha/fbC_burstsort", "Original fbC-burstsort");

CONTESTANT_REGISTER_PREPARE(sinha_C_burstsort_prepare, sinha_sC_burstsort,
                            "sinha/sC_burstsort", "Original sC-burstsort");

void sinha_CP_burstsort(unsigned char **strings, size_t size)
{
    /* set default values for command line args. */
    CACHESIZE = 1<<19;
    BINSIZE0 = 1<<9;		/* initial size of buffer in terminal node */
    while (BINSIZE0 < MAXKEYLEN) BINSIZE0 *= 2; /* must fix at least one key */

    SAMPLERATE = 0;		/* if >0, will sample 1/SAMPLERATE of data */
    FREEBURSTS = 0;		/* number of low threshold bursts to allow */

    NKEYS = size;
    NBYTES = g_string_datasize;
    LIMSIZE0 = BINSIZE0 - MAXKEYLEN;

    /* Zero counters for burst trie objects. */
    NODES = BINS = NULLBINS = 0;

    /* Create root node of burst trie. */
    NODE1 *rt = GETNODE(NODE1); ++NODES;

    /* Read string sequence. */
    radd((string)g_string_data, NKEYS, rt);

    /* Discard any remaining free bursts; see below in rtraverse() for why. */
    FREEBURSTS = 0;

    /* Starting at the root node, traverse the trie and sort each bin. */
    rtraverse(rt);

    /* Write out string characters and string pointers to psstest */
    tb_savetrie(rt, (string*)strings);

    /* Deallocate the root node. */
    rkill(rt);
}

void sinha_fbCP_burstsort(unsigned char **strings, size_t size)
{
    /* set default values for command line args. */
    CACHESIZE = 1<<19;
    BINSIZE0 = 1<<9;		/* initial size of buffer in terminal node */
    while (BINSIZE0 < MAXKEYLEN) BINSIZE0 *= 2; /* must fix at least one key */

    SAMPLERATE = 0;		/* if >0, will sample 1/SAMPLERATE of data */
    FREEBURSTS = 100;		/* number of low threshold bursts to allow */

    NKEYS = size;
    NBYTES = g_string_datasize;
    LIMSIZE0 = BINSIZE0 - MAXKEYLEN;

    /* Zero counters for burst trie objects. */
    NODES = BINS = NULLBINS = 0;

    /* Create root node of burst trie. */
    NODE1 *rt = GETNODE(NODE1); ++NODES;

    /* Read string sequence. */
    radd((string)g_string_data, NKEYS, rt);

    /* Discard any remaining free bursts; see below in rtraverse() for why. */
    FREEBURSTS = 0;

    /* Starting at the root node, traverse the trie and sort each bin. */
    rtraverse(rt);

    /* Write out string characters and string pointers to psstest */
    tb_savetrie(rt, (string*)strings);

    /* Deallocate the root node. */
    rkill(rt);
}

void sinha_sCP_burstsort(unsigned char **strings, size_t size)
{
    /* set default values for command line args. */
    CACHESIZE = 1<<19;
    BINSIZE0 = 1<<9;		/* initial size of buffer in terminal node */
    while (BINSIZE0 < MAXKEYLEN) BINSIZE0 *= 2; /* must fix at least one key */

    SAMPLERATE = 4000;		/* if >0, will sample 1/SAMPLERATE of data */
    FREEBURSTS = 0;		/* number of low threshold bursts to allow */

    NKEYS = size;
    NBYTES = g_string_datasize;
    LIMSIZE0 = BINSIZE0 - MAXKEYLEN;

    /* Zero counters for burst trie objects. */
    NODES = BINS = NULLBINS = 0;

    /* Create root node of burst trie. */
    NODE1 *rt = GETNODE(NODE1); ++NODES;

    if (SAMPLERATE)
    {
        rsample((string)g_string_data, rt, NKEYS / SAMPLERATE);
        clear(rt);
    }

    /* Read string sequence. */
    radd((string)g_string_data, NKEYS, rt);

    /* Discard any remaining free bursts; see below in rtraverse() for why. */
    FREEBURSTS = 0;

    /* Starting at the root node, traverse the trie and sort each bin. */
    rtraverse(rt);

    /* Write out string characters and string pointers to psstest */
    tb_savetrie(rt, (string*)strings);

    /* Deallocate the root node. */
    rkill(rt);
}

CONTESTANT_REGISTER_PREPARE(sinha_C_burstsort_prepare, sinha_CP_burstsort,
                            "sinha/CP_burstsort", "Original CP-burstsort");

CONTESTANT_REGISTER_PREPARE(sinha_C_burstsort_prepare, sinha_fbCP_burstsort,
                            "sinha/fbCP_burstsort", "Original fbCP-burstsort");

CONTESTANT_REGISTER_PREPARE(sinha_C_burstsort_prepare, sinha_sCP_burstsort,
                            "sinha/sCP_burstsort", "Original sCP-burstsort");

/* Initialization from pbdo() */
void sinha_CPL_burstsort_initialize()
{
    /* The command line arg TAILRATE specifies a
       percentage of the average key length to use as
       the key segment size; we calculate this now. */
    double d = TAILRATE; d /= 100;
    d *= NBYTES; d /= NKEYS;

    /* Command line arg TAILSIZE0 specifies a maximum
       key segment size; we choose the smaller of
       TAILSIZE0 and TAILSIZE calculated above. */
    TAILSIZE = MIN(d, TAILSIZE0);

    /* Both TAILSIZE and TAILSIZE + 4 are used in many
       operations, so we precalculate both; the extra
       four bytes is room for a pointer to the original
       record, which is kept immediately before the
       TAILSIZE bytes of suffix. */
    TAILSIZE4 = TAILSIZE + sizeof(string);

    /* In contrast to C- and CP- burstsort, the key
       segments inserted in bins during CPL-burstsort
       are of known fixed length and we can calculate
       ahead of time how many will fit for each bin
       size. */
    int	lv = 0; int sz = BINSIZE0;
    while (sz < CACHESIZE)
    {
        THR[lv] = sz / TAILSIZE4;
        sz *= 2;
        ++lv;
    }

    /* For the preceding levels, we allowed room for
       TAILSIZE bytes of each key plus a record pointer;
       when the bin reaches cache size, we also need to
       reserve room for the sorting pointerss that will
       be needed during container sorting.*/
    THR[lv] = CACHESIZE / (TAILSIZE + 8);
}

void sinha_CPL_burstsort(unsigned char **strings, size_t size)
{
    /* set default values for command line args. */
    CACHESIZE = 1<<19;
    BINSIZE0 = 1<<9;		/* initial size of buffer in terminal node */
    while (BINSIZE0 < MAXKEYLEN) BINSIZE0 *= 2; /* must fix at least one key */

    SAMPLERATE = 0;		/* if >0, will sample 1/SAMPLERATE of data */
    FREEBURSTS = 0;		/* number of low threshold bursts to allow */
    TAILSIZE0 = 12;             /* maximum tail size in CPL-burstsort */
    TAILRATE = 80;		/* used with TAILSIZE0 in CPL-burstsort */

    NKEYS = size;
    NBYTES = g_string_datasize;
    LIMSIZE0 = BINSIZE0 - MAXKEYLEN;

    sinha_CPL_burstsort_initialize();

    /* Zero counters for burst trie objects. */
    NODES = BINS = NULLBINS = 0;

    /* Create root node of burst trie. */
    NODE2 *rt = GETNODE(NODE2); ++NODES;

    /* Read string sequence. */
    padd((string)g_string_data, NKEYS, rt);

    /* Discard any remaining free bursts; see below in rtraverse() for why. */
    FREEBURSTS = 0;

    /* Starting at the root node, traverse the trie and sort each bin. */
    ptraverse(rt);

    /* Write out string characters and string pointers to psstest */
    tb_psavetrie(rt, (string*)strings);

    /* Deallocate the root node. */
    pkill(rt);
}

void sinha_fbCPL_burstsort(unsigned char **strings, size_t size)
{
    /* set default values for command line args. */
    CACHESIZE = 1<<19;
    BINSIZE0 = 1<<9;		/* initial size of buffer in terminal node */
    while (BINSIZE0 < MAXKEYLEN) BINSIZE0 *= 2; /* must fix at least one key */

    SAMPLERATE = 0;		/* if >0, will sample 1/SAMPLERATE of data */
    FREEBURSTS = 100;		/* number of low threshold bursts to allow */
    TAILSIZE0 = 12;             /* maximum tail size in CPL-burstsort */
    TAILRATE = 80;		/* used with TAILSIZE0 in CPL-burstsort */

    NKEYS = size;
    NBYTES = g_string_datasize;
    LIMSIZE0 = BINSIZE0 - MAXKEYLEN;

    sinha_CPL_burstsort_initialize();

    /* Zero counters for burst trie objects. */
    NODES = BINS = NULLBINS = 0;

    /* Create root node of burst trie. */
    NODE2 *rt = GETNODE(NODE2); ++NODES;

    /* Read string sequence. */
    padd((string)g_string_data, NKEYS, rt);

    /* Discard any remaining free bursts; see below in rtraverse() for why. */
    FREEBURSTS = 0;

    /* Starting at the root node, traverse the trie and sort each bin. */
    ptraverse(rt);

    /* Write out string characters and string pointers to psstest */
    tb_psavetrie(rt, (string*)strings);

    /* Deallocate the root node. */
    pkill(rt);
}

void sinha_sCPL_burstsort(unsigned char **strings, size_t size)
{
    /* set default values for command line args. */
    CACHESIZE = 1<<19;
    BINSIZE0 = 1<<9;		/* initial size of buffer in terminal node */
    while (BINSIZE0 < MAXKEYLEN) BINSIZE0 *= 2; /* must fix at least one key */

    SAMPLERATE = 4000;		/* if >0, will sample 1/SAMPLERATE of data */
    FREEBURSTS = 0;		/* number of low threshold bursts to allow */
    TAILSIZE0 = 12;             /* maximum tail size in CPL-burstsort */
    TAILRATE = 80;		/* used with TAILSIZE0 in CPL-burstsort */

    NKEYS = size;
    NBYTES = g_string_datasize;
    LIMSIZE0 = BINSIZE0 - MAXKEYLEN;

    sinha_CPL_burstsort_initialize();

    /* Zero counters for burst trie objects. */
    NODES = BINS = NULLBINS = 0;

    /* Create root node of burst trie. */
    NODE2 *rt = GETNODE(NODE2); ++NODES;

    if (SAMPLERATE)
    {
        psample((string)g_string_data, rt, NKEYS / SAMPLERATE);
        pclear(rt);
    }

    /* Read string sequence. */
    padd((string)g_string_data, NKEYS, rt);

    /* Discard any remaining free bursts; see below in rtraverse() for why. */
    FREEBURSTS = 0;

    /* Starting at the root node, traverse the trie and sort each bin. */
    ptraverse(rt);

    /* Write out string characters and string pointers to psstest */
    tb_psavetrie(rt, (string*)strings);

    /* Deallocate the root node. */
    pkill(rt);
}

CONTESTANT_REGISTER_PREPARE(sinha_C_burstsort_prepare, sinha_CPL_burstsort,
                            "sinha/CPL_burstsort", "Original CPL-burstsort");

CONTESTANT_REGISTER_PREPARE(sinha_C_burstsort_prepare, sinha_fbCPL_burstsort,
                            "sinha/fbCPL_burstsort", "Original fbCPL-burstsort");

CONTESTANT_REGISTER_PREPARE(sinha_C_burstsort_prepare, sinha_sCPL_burstsort,
                            "sinha/sCPL_burstsort", "Original sCPL-burstsort");

}