diff --git a/src/Makefile.am b/src/Makefile.am index 4202a4f..facc3d8 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -8,7 +8,8 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\ buffer_manager.c buffer_entry.c\ brz.c fch.c fch_buckets.c \ select.c compressed_seq.c \ - chd_ph.c miller_rabin.c + chd.c chd_ph.c miller_rabin.c \ + compressed_rank.c libcmph_la_LDFLAGS = -version-info 0:0:0 diff --git a/src/bdz.c b/src/bdz.c index 03c9dcc..ac833ab 100755 --- a/src/bdz.c +++ b/src/bdz.c @@ -269,6 +269,13 @@ cmph_t *bdz_new(cmph_config_t *mph, double c) bdz_queue_t edges; bdz_graph3_t graph3; bdz_config_data_t *bdz = (bdz_config_data_t *)mph->data; + #ifdef CMPH_TIMING + double construction_time_begin = 0.0; + double construction_time = 0.0; + ELAPSED_TIME_IN_SECONDS(&construction_time_begin); + #endif + + if (c == 0) c = 1.23; // validating restrictions over parameter c. DEBUGP("c: %f\n", c); bdz->m = mph->key_source->nkeys; @@ -338,7 +345,9 @@ cmph_t *bdz_new(cmph_config_t *mph, double c) fprintf(stderr, "Entering ranking step for mph creation of %u keys with graph sized %u\n", bdz->m, bdz->n); } ranking(bdz); - + #ifdef CMPH_TIMING + ELAPSED_TIME_IN_SECONDS(&construction_time); + #endif mphf = (cmph_t *)malloc(sizeof(cmph_t)); mphf->algo = mph->algo; bdzf = (bdz_data_t *)malloc(sizeof(bdz_data_t)); @@ -363,6 +372,14 @@ cmph_t *bdz_new(cmph_config_t *mph, double c) fprintf(stderr, "Successfully generated minimal perfect hash function\n"); } + + #ifdef CMPH_TIMING + register cmph_uint32 space_usage = bdz_packed_size(mphf)*8; + register cmph_uint32 keys_per_bucket = 1; + construction_time = construction_time - construction_time_begin; + fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz->m, bdz->m/(double)bdz->n, keys_per_bucket, construction_time, space_usage/(double)bdz->m); + #endif + return mphf; } diff --git a/src/bdz_ph.c b/src/bdz_ph.c index 933ba85..a3cc3cc 100755 --- a/src/bdz_ph.c +++ b/src/bdz_ph.c @@ -242,6 +242,12 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, 
double c) bdz_ph_queue_t edges; bdz_ph_graph3_t graph3; bdz_ph_config_data_t *bdz_ph = (bdz_ph_config_data_t *)mph->data; + #ifdef CMPH_TIMING + double construction_time_begin = 0.0; + double construction_time = 0.0; + ELAPSED_TIME_IN_SECONDS(&construction_time_begin); + #endif + if (c == 0) c = 1.23; // validating restrictions over parameter c. DEBUGP("c: %f\n", c); @@ -309,6 +315,9 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) bdz_ph_optimization(bdz_ph); + #ifdef CMPH_TIMING + ELAPSED_TIME_IN_SECONDS(&construction_time); + #endif mphf = (cmph_t *)malloc(sizeof(cmph_t)); mphf->algo = mph->algo; bdz_phf = (bdz_ph_data_t *)malloc(sizeof(bdz_ph_data_t)); @@ -328,6 +337,13 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) fprintf(stderr, "Successfully generated minimal perfect hash function\n"); } + #ifdef CMPH_TIMING + register cmph_uint32 space_usage = bdz_ph_packed_size(mphf)*8; + register cmph_uint32 keys_per_bucket = 1; + construction_time = construction_time - construction_time_begin; + fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz_ph->m, bdz_ph->m/(double)bdz_ph->n, keys_per_bucket, construction_time, space_usage/(double)bdz_ph->m); + #endif + return mphf; } diff --git a/src/bmz.c b/src/bmz.c index 2d71e7a..e8b6cf2 100644 --- a/src/bmz.c +++ b/src/bmz.c @@ -615,8 +615,6 @@ cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; - DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); if (h1 == h2 && ++h2 > n) h2 = 0; - DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m); return (g_ptr[h1] + g_ptr[h2]); } diff --git a/src/bmz8.c b/src/bmz8.c index 02117b2..dea5de4 100644 --- a/src/bmz8.c +++ b/src/bmz8.c @@ -479,11 +479,11 @@ int bmz8_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->m), sizeof(cmph_uint8), (size_t)1, fd); nbytes = fwrite(data->g, 
sizeof(cmph_uint8)*(data->n), (size_t)1, fd); - #ifdef DEBUG +/* #ifdef DEBUG fprintf(stderr, "G: "); for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]); fprintf(stderr, "\n"); - #endif + #endif*/ return 1; } @@ -625,6 +625,5 @@ cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); if (h1 == h2 && ++h2 > n) h2 = 0; - DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m); return (g_ptr[h1] + g_ptr[h2]); } diff --git a/src/brz.c b/src/brz.c index 479630a..58067f1 100755 --- a/src/brz.c +++ b/src/brz.c @@ -904,7 +904,6 @@ static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char * if (h1 == h2 && ++h2 >= n) h2 = 0; mphf_bucket = g[h1] + g[h2]; DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0); - DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h0]: %u edges: %u\n", key, g[h1], g[h2], >offset[h0], m); DEBUGP("Address: %u\n", mphf_bucket + offset[h0]); return (mphf_bucket + offset[h0]); } diff --git a/src/chd.c b/src/chd.c new file mode 100644 index 0000000..72523c0 --- /dev/null +++ b/src/chd.c @@ -0,0 +1,271 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "cmph_structs.h" +#include "chd_structs.h" +#include "chd.h" + +//#define DEBUG +#include "debug.h" + +chd_config_data_t *chd_config_new(cmph_config_t *mph) +{ + cmph_io_adapter_t *key_source = mph->key_source; + chd_config_data_t *chd; + chd = (chd_config_data_t *)malloc(sizeof(chd_config_data_t)); + assert(chd); + memset(chd, 0, sizeof(chd_config_data_t)); + + chd->chd_ph = cmph_config_new(key_source); + cmph_config_set_algo(chd->chd_ph, CMPH_CHD_PH); + + return chd; +} + +void chd_config_destroy(cmph_config_t *mph) +{ + chd_config_data_t *data = (chd_config_data_t *) mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + if(data->chd_ph) + { + 
cmph_config_destroy(data->chd_ph); + data->chd_ph = NULL; + } + free(data); +} + + +void chd_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + chd_config_data_t *data = (chd_config_data_t *) mph->data; + cmph_config_set_hashfuncs(data->chd_ph, hashfuncs); +} + + +void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket) +{ + chd_config_data_t *data = (chd_config_data_t *) mph->data; + cmph_config_set_b(data->chd_ph, keys_per_bucket); +} + + +void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) +{ + chd_config_data_t *data = (chd_config_data_t *) mph->data; + cmph_config_set_keys_per_bin(data->chd_ph, keys_per_bin); +} + + +cmph_t *chd_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + chd_data_t *chdf = NULL; + chd_config_data_t *chd = (chd_config_data_t *)mph->data; + chd_ph_config_data_t * chd_ph = (chd_ph_config_data_t *)chd->chd_ph->data; + compressed_rank_t cr; + + register cmph_t * chd_phf = NULL; + register cmph_uint32 packed_chd_phf_size = 0; + cmph_uint8 * packed_chd_phf = NULL; + + register cmph_uint32 packed_cr_size = 0; + cmph_uint8 * packed_cr = NULL; + + register cmph_uint32 i, idx, nkeys, nvals, nbins; + cmph_uint32 * vals_table = NULL; + register cmph_uint8 * occup_table = NULL; + #ifdef CMPH_TIMING + double construction_time_begin = 0.0; + double construction_time = 0.0; + ELAPSED_TIME_IN_SECONDS(&construction_time_begin); + #endif + + cmph_config_set_verbosity(chd->chd_ph, mph->verbosity); + cmph_config_set_graphsize(chd->chd_ph, c); + + if (mph->verbosity) + { + fprintf(stderr, "Generating a CHD_PH perfect hash function with a load factor equal to %.3f\n", c); + } + + chd_phf = cmph_new(chd->chd_ph); + + if(chd_phf == NULL) + { + return NULL; + } + + packed_chd_phf_size = cmph_packed_size(chd_phf); + DEBUGP("packed_chd_phf_size = %u\n", packed_chd_phf_size); + + /* Make sure that we have enough space to pack the mphf. 
*/ + packed_chd_phf = calloc((size_t)packed_chd_phf_size,(size_t)1); + + /* Pack the mphf. */ + cmph_pack(chd_phf, packed_chd_phf); + + cmph_destroy(chd_phf); + + + if (mph->verbosity) + { + fprintf(stderr, "Compressing the range of the resulting CHD_PH perfect hash function\n"); + } + + compressed_rank_init(&cr); + nbins = chd_ph->n; + nkeys = chd_ph->m; + nvals = nbins - nkeys; + + vals_table = (cmph_uint32 *)calloc(nvals, sizeof(cmph_uint32)); + occup_table = chd_ph->occup_table; + + for(i = 0, idx = 0; i < nbins; i++) + { + if(occup_table[i] == 0) + { + vals_table[idx++] = i; + } + } + + compressed_rank_generate(&cr, vals_table, nvals); + free(vals_table); + + packed_cr_size = compressed_rank_packed_size(&cr); + packed_cr = (cmph_uint8 *) calloc(packed_cr_size, sizeof(cmph_uint8)); + compressed_rank_pack(&cr, packed_cr); + compressed_rank_destroy(&cr); + + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + chdf = (chd_data_t *)malloc(sizeof(chd_data_t)); + + chdf->packed_cr = packed_cr; + packed_cr = NULL; //transfer memory ownership + + chdf->packed_chd_phf = packed_chd_phf; + packed_chd_phf = NULL; //transfer memory ownership + + chdf->packed_chd_phf_size = packed_chd_phf_size; + chdf->packed_cr_size = packed_cr_size; + + mphf->data = chdf; + mphf->size = nkeys; + + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + #ifdef CMPH_TIMING + ELAPSED_TIME_IN_SECONDS(&construction_time); + register cmph_uint32 space_usage = chd_packed_size(mphf)*8; + construction_time = construction_time - construction_time_begin; + fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", nkeys, c, chd_ph->keys_per_bucket, construction_time, space_usage/(double)nkeys); + #endif + + return mphf; +} + +void chd_load(FILE *fd, cmph_t *mphf) +{ + register cmph_uint32 nbytes; + chd_data_t *chd = (chd_data_t *)malloc(sizeof(chd_data_t)); + + DEBUGP("Loading chd 
mphf\n"); + mphf->data = chd; + + nbytes = fread(&chd->packed_chd_phf_size, sizeof(cmph_uint32), (size_t)1, fd); + DEBUGP("Loading CHD_PH perfect hash function with %u bytes to disk\n", chd->packed_chd_phf_size); + chd->packed_chd_phf = (cmph_uint8 *) calloc((size_t)chd->packed_chd_phf_size,(size_t)1); + nbytes = fread(chd->packed_chd_phf, chd->packed_chd_phf_size, (size_t)1, fd); + + nbytes = fread(&chd->packed_cr_size, sizeof(cmph_uint32), (size_t)1, fd); + DEBUGP("Loading Compressed rank structure, which has %u bytes\n", chd->packed_cr_size); + chd->packed_cr = (cmph_uint8 *) calloc((size_t)chd->packed_cr_size, (size_t)1); + nbytes = fread(chd->packed_cr, chd->packed_cr_size, (size_t)1, fd); +} + +int chd_dump(cmph_t *mphf, FILE *fd) +{ + register cmph_uint32 nbytes; + chd_data_t *data = (chd_data_t *)mphf->data; + + __cmph_dump(mphf, fd); + // Dumping CHD_PH perfect hash function + + DEBUGP("Dumping CHD_PH perfect hash function with %u bytes to disk\n", data->packed_chd_phf_size); + nbytes = fwrite(&data->packed_chd_phf_size, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(data->packed_chd_phf, data->packed_chd_phf_size, (size_t)1, fd); + + DEBUGP("Dumping compressed rank structure with %u bytes to disk\n", buflen); + nbytes = fwrite(&data->packed_cr_size, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(data->packed_cr, data->packed_cr_size, (size_t)1, fd); + + return 1; +} + +void chd_destroy(cmph_t *mphf) +{ + chd_data_t *data = (chd_data_t *)mphf->data; + free(data->packed_chd_phf); + free(data->packed_cr); + free(data); + free(mphf); +} + +static inline cmph_uint32 _chd_search(void * packed_chd_phf, void * packed_cr, const char *key, cmph_uint32 keylen) +{ + register cmph_uint32 bin_idx = cmph_search_packed(packed_chd_phf, key, keylen); + register cmph_uint32 rank = compressed_rank_query_packed(packed_cr, bin_idx); + return bin_idx - rank; +} + +cmph_uint32 chd_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + register chd_data_t 
* chd = mphf->data; + return _chd_search(chd->packed_chd_phf, chd->packed_cr, key, keylen); +} + +void chd_pack(cmph_t *mphf, void *packed_mphf) +{ + chd_data_t *data = (chd_data_t *)mphf->data; + cmph_uint32 * ptr = packed_mphf; + cmph_uint8 * ptr8; + + // packing packed_cr_size and packed_cr + *ptr = data->packed_cr_size; + ptr8 = (cmph_uint8 *) (ptr + 1); + + memcpy(ptr8, data->packed_cr, data->packed_cr_size); + ptr8 += data->packed_cr_size; + + ptr = (cmph_uint32 *) ptr8; + *ptr = data->packed_chd_phf_size; + + ptr8 = (cmph_uint8 *) (ptr + 1); + memcpy(ptr8, data->packed_chd_phf, data->packed_chd_phf_size); +} + +cmph_uint32 chd_packed_size(cmph_t *mphf) +{ + register chd_data_t *data = (chd_data_t *)mphf->data; + return (sizeof(CMPH_ALGO) + 2*sizeof(cmph_uint32) + data->packed_cr_size + data->packed_chd_phf_size); + +} + +cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + + register cmph_uint32 * ptr = packed_mphf; + register cmph_uint32 packed_cr_size = *ptr++; + register cmph_uint8 * packed_chd_phf = ((cmph_uint8 *) ptr) + packed_cr_size + sizeof(cmph_uint32); + return _chd_search(packed_chd_phf, ptr, key, keylen); +} + + diff --git a/src/chd.h b/src/chd.h new file mode 100644 index 0000000..e829df8 --- /dev/null +++ b/src/chd.h @@ -0,0 +1,59 @@ +#ifndef _CMPH_CHD_H__ +#define _CMPH_CHD_H__ + +#include "cmph.h" + +typedef struct __chd_data_t chd_data_t; +typedef struct __chd_config_data_t chd_config_data_t; + +/* Config API */ +chd_config_data_t *chd_config_new(cmph_config_t * mph); +void chd_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); + +/** \fn void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin); + * \brief Allows to set the number of keys per bin. 
+ * \param mph pointer to the configuration structure + * \param keys_per_bin value for the number of keys per bin + */ +void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin); + +/** \fn void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket); + * \brief Allows to set the number of keys per bucket. + * \param mph pointer to the configuration structure + * \param keys_per_bucket value for the number of keys per bucket + */ +void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket); +void chd_config_destroy(cmph_config_t *mph); + + +/* Chd algorithm API */ +cmph_t *chd_new(cmph_config_t *mph, double c); +void chd_load(FILE *fd, cmph_t *mphf); +int chd_dump(cmph_t *mphf, FILE *fd); +void chd_destroy(cmph_t *mphf); +cmph_uint32 chd_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void chd_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void chd_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 chd_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 chd_packed_size(cmph_t *mphf); + +/** cmph_uint32 chd_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. 
+ * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/src/chd_ph.c b/src/chd_ph.c index fe3511d..68fe6c7 100644 --- a/src/chd_ph.c +++ b/src/chd_ph.c @@ -11,7 +11,7 @@ #include "chd_ph.h" #include"miller_rabin.h" -#define DEBUG +//#define DEBUG #include "debug.h" // NO_ELEMENT is equivalent to null pointer @@ -99,8 +99,7 @@ static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * bu static inline cmph_uint32 * chd_ph_ordering(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, cmph_uint32 max_bucket_size); static inline cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size, - cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table, - cmph_uint8 * occup_table); + cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table); static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r) { @@ -146,10 +145,8 @@ chd_ph_config_data_t *chd_ph_config_new() chd_ph->use_h = 1; chd_ph->keys_per_bin = 1; chd_ph->keys_per_bucket = 4; + chd_ph->occup_table = 0; - //The following fields are used just for statistics - chd_ph->space_usage = 0; - chd_ph->entropy = 0.0; return chd_ph; } @@ -157,6 +154,11 @@ void chd_ph_config_destroy(cmph_config_t *mph) { chd_ph_config_data_t *data = (chd_ph_config_data_t *) mph->data; DEBUGP("Destroying algorithm dependent data\n"); + if(data->occup_table) + { + free(data->occup_table); + data->occup_table = NULL; + } free(data); } @@ -286,8 +288,8 @@ cmph_uint32 * chd_ph_ordering(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, c return sorted_lists; } -static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint8 * occup_table, - cmph_uint32 probe0_num, cmph_uint32 
probe1_num, cmph_uint32 bucket_num) +static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 probe0_num, + cmph_uint32 probe1_num, cmph_uint32 bucket_num) { register cmph_uint32 i; register cmph_uint32 size = buckets[bucket_num].size; @@ -300,11 +302,11 @@ static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph { position = (item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n; - if(occup_table[position] >= chd_ph->keys_per_bin) + if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin) { break; } - occup_table[position]++; + (chd_ph->occup_table[position])++; item = item->next; }; @@ -319,7 +321,7 @@ static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph break; } position = (item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n; - occup_table[position]--; + (chd_ph->occup_table[position])--; item = item->next; i--; }; @@ -329,7 +331,7 @@ static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph }; static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_probes, - cmph_uint32 * disp_table, cmph_uint8 * occup_table, cmph_uint32 bucket_num) + cmph_uint32 * disp_table, cmph_uint32 bucket_num) { register cmph_uint32 probe0_num, probe1_num, probe_num; @@ -339,7 +341,7 @@ static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucke while(1) { - if(place_bucket_probe(chd_ph, buckets, occup_table, probe0_num, probe1_num, bucket_num)) + if(place_bucket_probe(chd_ph, buckets, probe0_num, probe1_num, bucket_num)) { disp_table[bucket_num] = probe0_num + probe1_num * chd_ph->n; return 1; @@ -360,8 +362,7 @@ static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucke }; static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size, - cmph_uint32 *sorted_lists, 
cmph_uint32 max_probes, cmph_uint32 * disp_table, - cmph_uint8 * occup_table) + cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table) { register cmph_uint32 i = 0; register cmph_uint32 curr_bucket = 0; @@ -371,7 +372,7 @@ static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_buc curr_bucket = sorted_lists[i]; while(curr_bucket != NO_ELEMENT) { - if(!place_bucket(chd_ph, buckets, max_probes, disp_table, occup_table, curr_bucket)) + if(!place_bucket(chd_ph, buckets, max_probes, disp_table, curr_bucket)) { return 0; } @@ -383,8 +384,7 @@ static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_buc }; static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size, - cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table, - cmph_uint8 * occup_table) + cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table) { register cmph_uint32 i; register cmph_uint32 curr_bucket, prev_bucket; @@ -402,7 +402,7 @@ static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_buc while(curr_bucket != NO_ELEMENT) { // if bucket is successfully placed remove it from list - if(place_bucket_probe(chd_ph, buckets, occup_table, probe0_num, probe1_num, curr_bucket)) + if(place_bucket_probe(chd_ph, buckets, probe0_num, probe1_num, curr_bucket)) { disp_table[curr_bucket] = probe0_num + probe1_num * chd_ph->n; // DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]); @@ -440,29 +440,28 @@ static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_buc }; cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size, - cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table, - cmph_uint8 * occup_table) + cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table) { 
if(chd_ph->use_h) { - return place_buckets2(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table); + return place_buckets2(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table); } else { - return place_buckets1(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table); + return place_buckets1(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table); } } static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, - cmph_uint32 * disp_table, cmph_uint8 * occup_table) + cmph_uint32 * disp_table) { register cmph_uint32 i, j; register cmph_uint32 position, probe0_num, probe1_num; register cmph_uint32 m = 0; register chd_ph_item_t * item; - memset(occup_table, 0, chd_ph->n); + memset(chd_ph->occup_table, 0, chd_ph->n); for(i = 0; i < chd_ph->nbuckets; i++) { j = buckets[i].size; @@ -477,11 +476,11 @@ static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, } m++; position = (item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n; - if(occup_table[position] >= chd_ph->keys_per_bin) + if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin) { return 0; } - occup_table[position]++; + (chd_ph->occup_table[position])++; item = item->next; }; }; @@ -498,7 +497,7 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) register double load_factor = c; register cmph_uint8 searching_success = 0; - register cmph_uint32 max_probes = 1 << 18; // default value for max_probes + register cmph_uint32 max_probes = 1 << 20; // default value for max_probes register cmph_uint32 iterations = 100; chd_ph_bucket_t * buckets = NULL; chd_ph_item_t * items = NULL; @@ -506,8 +505,14 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) cmph_uint32 max_bucket_size = 0; cmph_uint32 * sorted_lists = NULL; cmph_uint32 * disp_table = NULL; - cmph_uint8 * occup_table; - + register double space_lower_bound = 0; + #ifdef CMPH_TIMING + 
double construction_time_begin = 0.0; + double construction_time = 0.0; + ELAPSED_TIME_IN_SECONDS(&construction_time_begin); + #endif + + chd_ph->m = mph->key_source->nkeys; DEBUGP("m = %u\n", chd_ph->m); @@ -539,10 +544,14 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) }; DEBUGP("n = %u \n", chd_ph->n); - - if(mph->verbosity && chd_ph->keys_per_bin == 1) + if(chd_ph->keys_per_bin == 1) { - fprintf(stderr, "space lower bound is %.3f bits per key\n", chd_ph_space_lower_bound(chd_ph->m, chd_ph->n)); + space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n); + } + + if(mph->verbosity) + { + fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound); } // We allocate the working tables @@ -550,7 +559,7 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t)); max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes); - occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8)); + chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8)); disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32)); // // init_genrand(time(0)); @@ -588,12 +597,12 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) fprintf(stderr, "Starting searching step\n"); } - searching_success = chd_ph_searching(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table); + searching_success = chd_ph_searching(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table); if(searching_success) break; // reset occup_table - memset(occup_table, 0, chd_ph->n); + memset(chd_ph->occup_table, 0, chd_ph->n); if(iterations == 0) { // Cleanup memory @@ -606,16 +615,15 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) }; } - #ifdef DEBUG - chd_ph->entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes); - DEBUGP("Entropy = %.4f\n", chd_ph->entropy/chd_ph->m); - - if(!chd_ph_check_bin_hashing(chd_ph, buckets, 
disp_table, occup_table)) + #ifdef DEBUG { - - DEBUGP("Error for bin packing generation"); - return NULL; - }; + if(!chd_ph_check_bin_hashing(chd_ph, buckets, disp_table)) + { + + DEBUGP("Error for bin packing generation"); + return NULL; + } + } #endif if (mph->verbosity) @@ -630,16 +638,18 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); compressed_seq_init(chd_ph->cs); compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets); - chd_ph->space_usage = compressed_seq_get_space_usage(chd_ph->cs); - chd_ph->space_usage += 64; - DEBUGP("space_usage/key = %.4f\n", chd_ph->space_usage/(double)chd_ph->m); + #ifdef CMPH_TIMING + ELAPSED_TIME_IN_SECONDS(&construction_time); + register double entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes); + DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m); + #endif + cleanup: chd_ph_bucket_destroy(buckets); free(items); free(sorted_lists); free(disp_table); - free(occup_table); if(failure) { if(chd_ph->hl) @@ -669,6 +679,12 @@ cleanup: { fprintf(stderr, "Successfully generated minimal perfect hash function\n"); } + + #ifdef CMPH_TIMING + register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8; + construction_time = construction_time - construction_time_begin; + fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m); + #endif return mphf; } diff --git a/src/chd_structs.h b/src/chd_structs.h new file mode 100644 index 0000000..d62f682 --- /dev/null +++ b/src/chd_structs.h @@ -0,0 +1,21 @@ +#ifndef __CMPH_CHD_STRUCTS_H__ +#define __CMPH_CHD_STRUCTS_H__ + +#include "chd_structs_ph.h" +#include "chd_ph.h" +#include "compressed_rank.h" + +struct __chd_data_t +{ + cmph_uint32 packed_cr_size; + cmph_uint8 * packed_cr; // packed compressed rank structure to control the number of zeros in a bit vector + 
+ cmph_uint32 packed_chd_phf_size; + cmph_uint8 * packed_chd_phf; +}; + +struct __chd_config_data_t +{ + cmph_config_t *chd_ph; // chd_ph algorithm must be used here +}; +#endif diff --git a/src/chd_structs_ph.h b/src/chd_structs_ph.h index c93b6bd..d869218 100644 --- a/src/chd_structs_ph.h +++ b/src/chd_structs_ph.h @@ -24,9 +24,6 @@ struct __chd_ph_config_data_t cmph_uint8 use_h; // flag to indicate the of use of a heuristic (use_h = 1) cmph_uint32 keys_per_bin;//maximum number of keys per bin cmph_uint32 keys_per_bucket; // average number of keys per bucket - - //The following fields are used just for statistics - cmph_uint32 space_usage; - double entropy; + cmph_uint8 *occup_table; // table that indicates occupied positions }; #endif diff --git a/src/chm.c b/src/chm.c index 5669080..54561f3 100644 --- a/src/chm.c +++ b/src/chm.c @@ -225,11 +225,11 @@ int chm_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(data->g, sizeof(cmph_uint32)*data->n, (size_t)1, fd); - #ifdef DEBUG +/* #ifdef DEBUG fprintf(stderr, "G: "); for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]); fprintf(stderr, "\n"); - #endif + #endif*/ return 1; } diff --git a/src/cmph.c b/src/cmph.c index 40f16ec..1d87152 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -8,6 +8,7 @@ #include "bdz.h" /* included -- Fabiano */ #include "bdz_ph.h" /* included -- Fabiano */ #include "chd_ph.h" /* included -- Fabiano */ +#include "chd.h" /* included -- Fabiano */ #include #include @@ -15,7 +16,7 @@ //#define DEBUG #include "debug.h" -const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", NULL }; /* included -- Fabiano */ +const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", "chd", NULL }; /* included -- Fabiano */ typedef struct { @@ -325,6 +326,9 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) case CMPH_CHD_PH: chd_ph_config_destroy(mph); break; + 
case CMPH_CHD: + chd_config_destroy(mph); + break; default: assert(0); } @@ -354,6 +358,9 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) case CMPH_CHD_PH: mph->data = chd_ph_config_new(); break; + case CMPH_CHD: + mph->data = chd_config_new(mph); + break; default: assert(0); } @@ -392,6 +399,10 @@ void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b) { chd_ph_config_set_b(mph, b); } + else if (mph->algo == CMPH_CHD) + { + chd_config_set_b(mph, b); + } } void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) @@ -400,6 +411,10 @@ void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) { chd_ph_config_set_keys_per_bin(mph, keys_per_bin); } + else if (mph->algo == CMPH_CHD) + { + chd_config_set_keys_per_bin(mph, keys_per_bin); + } } void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability) @@ -441,6 +456,9 @@ void cmph_config_destroy(cmph_config_t *mph) case CMPH_CHD_PH: /* included -- Fabiano */ chd_ph_config_destroy(mph); break; + case CMPH_CHD: /* included -- Fabiano */ + chd_config_destroy(mph); + break; default: assert(0); } @@ -481,6 +499,9 @@ void cmph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) case CMPH_CHD_PH: /* included -- Fabiano */ chd_ph_config_set_hashfuncs(mph, hashfuncs); break; + case CMPH_CHD: /* included -- Fabiano */ + chd_config_set_hashfuncs(mph, hashfuncs); + break; default: break; } @@ -534,6 +555,10 @@ cmph_t *cmph_new(cmph_config_t *mph) DEBUGP("Creating chd_ph hash\n"); mphf = chd_ph_new(mph, c); break; + case CMPH_CHD: /* included -- Fabiano */ + DEBUGP("Creating chd hash\n"); + mphf = chd_new(mph, c); + break; default: assert(0); } @@ -560,6 +585,8 @@ int cmph_dump(cmph_t *mphf, FILE *f) return bdz_ph_dump(mphf, f); case CMPH_CHD_PH: /* included -- Fabiano */ return chd_ph_dump(mphf, f); + case CMPH_CHD: /* included -- Fabiano */ + return chd_dump(mphf, f); default: assert(0); } @@ -607,6 +634,10 @@ cmph_t 
*cmph_load(FILE *f) DEBUGP("Loading chd_ph algorithm dependent parts\n"); chd_ph_load(f, mphf); break; + case CMPH_CHD: /* included -- Fabiano */ + DEBUGP("Loading chd algorithm dependent parts\n"); + chd_load(f, mphf); + break; default: assert(0); } @@ -643,6 +674,9 @@ cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) case CMPH_CHD_PH: /* included -- Fabiano */ DEBUGP("chd_ph algorithm search\n"); return chd_ph_search(mphf, key, keylen); + case CMPH_CHD: /* included -- Fabiano */ + DEBUGP("chd algorithm search\n"); + return chd_search(mphf, key, keylen); default: assert(0); } @@ -683,6 +717,9 @@ void cmph_destroy(cmph_t *mphf) case CMPH_CHD_PH: /* included -- Fabiano */ chd_ph_destroy(mphf); return; + case CMPH_CHD: /* included -- Fabiano */ + chd_destroy(mphf); + return; default: assert(0); } @@ -728,6 +765,9 @@ void cmph_pack(cmph_t *mphf, void *packed_mphf) case CMPH_CHD_PH: /* included -- Fabiano */ chd_ph_pack(mphf, ptr); break; + case CMPH_CHD: /* included -- Fabiano */ + chd_pack(mphf, ptr); + break; default: assert(0); } @@ -759,6 +799,8 @@ cmph_uint32 cmph_packed_size(cmph_t *mphf) return bdz_ph_packed_size(mphf); case CMPH_CHD_PH: /* included -- Fabiano */ return chd_ph_packed_size(mphf); + case CMPH_CHD: /* included -- Fabiano */ + return chd_packed_size(mphf); default: assert(0); } @@ -794,6 +836,8 @@ cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 k return bdz_ph_search_packed(++ptr, key, keylen); case CMPH_CHD_PH: /* included -- Fabiano */ return chd_ph_search_packed(++ptr, key, keylen); + case CMPH_CHD: /* included -- Fabiano */ + return chd_search_packed(++ptr, key, keylen); default: assert(0); } diff --git a/src/cmph.h b/src/cmph.h index 35a77ca..1bc009e 100644 --- a/src/cmph.h +++ b/src/cmph.h @@ -101,6 +101,10 @@ cmph_uint32 cmph_packed_size(cmph_t *mphf); */ cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); +// TIMING functions. 
To use them, the macro CMPH_TIMING must be defined +#include "cmph_time.h" + + #ifdef __cplusplus } #endif diff --git a/src/cmph_time.h b/src/cmph_time.h new file mode 100644 index 0000000..39180ee --- /dev/null +++ b/src/cmph_time.h @@ -0,0 +1,62 @@ +#ifdef ELAPSED_TIME_IN_SECONDS +#undef ELAPSED_TIME_IN_SECONDS +#endif + +#ifdef ELAPSED_TIME_IN_uSECONDS +#undef ELAPSED_TIME_IN_uSECONDS +#endif + +#ifdef WIN32 +// include headers to use gettimeofday +#else + #ifdef __GNUC__ + #include <sys/time.h> + #include <sys/resource.h> + #endif +#endif + +#ifdef __GNUC__ + #ifndef __CMPH_TIME_H__ + #define __CMPH_TIME_H__ + static inline void elapsed_time_in_seconds(double * elapsed_time) + { + struct timeval e_time; + if (gettimeofday(&e_time, NULL) < 0) { + return; + } + *elapsed_time = (double)e_time.tv_sec + ((double)e_time.tv_usec/1000000.0); + } + static inline void dummy_elapsed_time_in_seconds() + { + } + static inline void elapsed_time_in_useconds(cmph_uint64 * elapsed_time) + { + struct timeval e_time; + if (gettimeofday(&e_time, NULL) < 0) { + return; + } + *elapsed_time = e_time.tv_sec*1000000 + e_time.tv_usec; + } + static inline void dummy_elapsed_time_in_useconds() + { + } + #endif +#endif + +#ifdef CMPH_TIMING + #ifdef __GNUC__ + #define ELAPSED_TIME_IN_SECONDS elapsed_time_in_seconds + #define ELAPSED_TIME_IN_uSECONDS elapsed_time_in_useconds + #else + #define ELAPSED_TIME_IN_SECONDS dummy_elapsed_time_in_seconds + #define ELAPSED_TIME_IN_uSECONDS dummy_elapsed_time_in_useconds + #endif +#else + #ifdef __GNUC__ + #define ELAPSED_TIME_IN_SECONDS + #define ELAPSED_TIME_IN_uSECONDS + #else + #define ELAPSED_TIME_IN_SECONDS dummy_elapsed_time_in_seconds + #define ELAPSED_TIME_IN_uSECONDS dummy_elapsed_time_in_useconds + #endif +#endif diff --git a/src/cmph_types.h b/src/cmph_types.h index fdf9735..894dbde 100644 --- a/src/cmph_types.h +++ b/src/cmph_types.h @@ -35,7 +35,7 @@ typedef unsigned int cmph_uint32; typedef enum { CMPH_HASH_JENKINS, CMPH_HASH_COUNT } CMPH_HASH; extern const char 
*cmph_hash_names[]; typedef enum { CMPH_BMZ, CMPH_BMZ8, CMPH_CHM, CMPH_BRZ, CMPH_FCH, - CMPH_BDZ, CMPH_BDZ_PH, CMPH_CHD_PH, CMPH_COUNT } CMPH_ALGO; /* included -- Fabiano */ + CMPH_BDZ, CMPH_BDZ_PH, CMPH_CHD_PH, CMPH_CHD, CMPH_COUNT } CMPH_ALGO; /* included -- Fabiano */ extern const char *cmph_names[]; #endif diff --git a/src/compressed_rank.c b/src/compressed_rank.c new file mode 100644 index 0000000..f376d17 --- /dev/null +++ b/src/compressed_rank.c @@ -0,0 +1,321 @@ +#include<stdlib.h> +#include<stdio.h> +#include<limits.h> +#include<string.h> +#include"compressed_rank.h" +#include"bitbool.h" +// #define DEBUG +#include"debug.h" +static inline cmph_uint32 compressed_rank_i_log2(cmph_uint32 x) +{ + register cmph_uint32 res = 0; + + while(x > 1) + { + x >>= 1; + res++; + } + return res; +}; + +void compressed_rank_init(compressed_rank_t * cr) +{ + cr->max_val = 0; + cr->n = 0; + cr->rem_r = 0; + select_init(&cr->sel); + cr->vals_rems = 0; +} + +void compressed_rank_destroy(compressed_rank_t * cr) +{ + free(cr->vals_rems); + cr->vals_rems = 0; + select_destroy(&cr->sel); +} + +void compressed_rank_generate(compressed_rank_t * cr, cmph_uint32 * vals_table, cmph_uint32 n) +{ + register cmph_uint32 i,j; + register cmph_uint32 rems_mask; + register cmph_uint32 * select_vec = 0; + cr->n = n; + cr->max_val = vals_table[cr->n - 1]; + cr->rem_r = compressed_rank_i_log2(cr->max_val/cr->n); + if(cr->rem_r == 0) + { + cr->rem_r = 1; + } + select_vec = (cmph_uint32 *) calloc(cr->max_val >> cr->rem_r, sizeof(cmph_uint32)); + cr->vals_rems = (cmph_uint32 *) calloc(BITS_TABLE_SIZE(cr->n, cr->rem_r), sizeof(cmph_uint32)); + rems_mask = (1 << cr->rem_r) - 1; + + for(i = 0; i < cr->n; i++) + { + set_bits_value(cr->vals_rems, i, vals_table[i] & rems_mask, cr->rem_r, rems_mask); + } + + for(i = 1, j = 0; i <= cr->max_val >> cr->rem_r; i++) + { + while(i > (vals_table[j] >> cr->rem_r)) + { + j++; + } + select_vec[i - 1] = j; + }; + + + // FABIANO: before it was (cr->total_length >> cr->rem_r) + 1. 
But I wiped out the + 1 because + // I changed the select structure to work up to m, instead of up to m - 1. + select_generate(&cr->sel, select_vec, cr->max_val >> cr->rem_r, cr->n); + + free(select_vec); +} + +cmph_uint32 compressed_rank_query(compressed_rank_t * cr, cmph_uint32 idx) +{ + register cmph_uint32 rems_mask; + register cmph_uint32 val_quot, val_rem; + register cmph_uint32 sel_res, rank; + + if(idx > cr->max_val) + { + return cr->n; + } + + val_quot = idx >> cr->rem_r; + rems_mask = (1 << cr->rem_r) - 1; + val_rem = idx & rems_mask; + if(val_quot == 0) + { + rank = sel_res = 0; + } + else + { + sel_res = select_query(&cr->sel, val_quot - 1) + 1; + rank = sel_res - val_quot; + } + + do + { + if(GETBIT32(cr->sel.bits_vec, sel_res)) + { + break; + } + if(get_bits_value(cr->vals_rems, rank, cr->rem_r, rems_mask) >= val_rem) + { + break; + } + sel_res++; + rank++; + } while(1); + + return rank; +} + +cmph_uint32 compressed_rank_get_space_usage(compressed_rank_t * cr) +{ + register cmph_uint32 space_usage = select_get_space_usage(&cr->sel); + space_usage += BITS_TABLE_SIZE(cr->n, cr->rem_r)*sizeof(cmph_uint32)*8; + space_usage += 3*sizeof(cmph_uint32)*8; + return space_usage; +} + +void compressed_rank_dump(compressed_rank_t * cr, char **buf, cmph_uint32 *buflen) +{ + register cmph_uint32 sel_size = select_packed_size(&(cr->sel)); + register cmph_uint32 vals_rems_size = BITS_TABLE_SIZE(cr->n, cr->rem_r) * sizeof(cmph_uint32); + register cmph_uint32 pos = 0; + char * buf_sel = 0; + cmph_uint32 buflen_sel = 0; + + *buflen = 4*sizeof(cmph_uint32) + sel_size + vals_rems_size; + + DEBUGP("sel_size = %u\n", sel_size); + DEBUGP("vals_rems_size = %u\n", vals_rems_size); + + *buf = (char *)calloc(*buflen, sizeof(char)); + + if (!*buf) + { + *buflen = UINT_MAX; + return; + } + + // dumping max_val, n and rem_r + memcpy(*buf, &(cr->max_val), sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("max_val = %u\n", cr->max_val); + + memcpy(*buf + pos, &(cr->n), 
sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("n = %u\n", cr->n); + + memcpy(*buf + pos, &(cr->rem_r), sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("rem_r = %u\n", cr->rem_r); + + // dumping sel + select_dump(&cr->sel, &buf_sel, &buflen_sel); + memcpy(*buf + pos, &buflen_sel, sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("buflen_sel = %u\n", buflen_sel); + + memcpy(*buf + pos, buf_sel, buflen_sel); + + #ifdef DEBUG + cmph_uint32 i = 0; + for(i = 0; i < buflen_sel; i++) + { + DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(*buf + pos + i)); + } + #endif + pos += buflen_sel; + + free(buf_sel); + + // dumping vals_rems + memcpy(*buf + pos, cr->vals_rems, vals_rems_size); + #ifdef DEBUG + for(i = 0; i < vals_rems_size; i++) + { + DEBUGP("pos = %u -- vals_rems_size = %u -- vals_rems[%u] = %u\n", pos, vals_rems_size, i, *(*buf + pos + i)); + } + #endif + pos += vals_rems_size; + + DEBUGP("Dumped compressed rank structure with size %u bytes\n", *buflen); +} + +void compressed_rank_load(compressed_rank_t * cr, const char *buf, cmph_uint32 buflen) +{ + register cmph_uint32 pos = 0; + cmph_uint32 buflen_sel = 0; + register cmph_uint32 vals_rems_size = 0; + + // loading max_val, n, and rem_r + memcpy(&(cr->max_val), buf, sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("max_val = %u\n", cr->max_val); + + memcpy(&(cr->n), buf + pos, sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("n = %u\n", cr->n); + + memcpy(&(cr->rem_r), buf + pos, sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("rem_r = %u\n", cr->rem_r); + + // loading sel + memcpy(&buflen_sel, buf + pos, sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("buflen_sel = %u\n", buflen_sel); + + select_load(&cr->sel, buf + pos, buflen_sel); + #ifdef DEBUG + cmph_uint32 i = 0; + for(i = 0; i < buflen_sel; i++) + { + DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(buf + pos + i)); + } + #endif + pos += buflen_sel; + + // loading 
vals_rems + if(cr->vals_rems) + { + free(cr->vals_rems); + } + vals_rems_size = BITS_TABLE_SIZE(cr->n, cr->rem_r); + cr->vals_rems = (cmph_uint32 *) calloc(vals_rems_size, sizeof(cmph_uint32)); + vals_rems_size *= 4; + memcpy(cr->vals_rems, buf + pos, vals_rems_size); + + #ifdef DEBUG + for(i = 0; i < vals_rems_size; i++) + { + DEBUGP("pos = %u -- vals_rems_size = %u -- vals_rems[%u] = %u\n", pos, vals_rems_size, i, *(buf + pos + i)); + } + #endif + pos += vals_rems_size; + + DEBUGP("Loaded compressed rank structure with size %u bytes\n", buflen); +} + + + +void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed) +{ + if (cr && cr_packed) + { + char *buf = NULL; + cmph_uint32 buflen = 0; + compressed_rank_dump(cr, &buf, &buflen); + memcpy(cr_packed, buf, buflen); + free(buf); + } +} + +cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr) +{ + register cmph_uint32 sel_size = select_packed_size(&cr->sel); + register cmph_uint32 vals_rems_size = BITS_TABLE_SIZE(cr->n, cr->rem_r) * sizeof(cmph_uint32); + return 4 * sizeof(cmph_uint32) + sel_size + vals_rems_size; +} + +cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx) +{ + // unpacking cr_packed + register cmph_uint32 *ptr = (cmph_uint32 *)cr_packed; + register cmph_uint32 max_val = *ptr++; + register cmph_uint32 n = *ptr++; + register cmph_uint32 rem_r = *ptr++; + register cmph_uint32 buflen_sel = *ptr++; + register cmph_uint32 * sel_packed = ptr; + + register cmph_uint32 * bits_vec = sel_packed + 2; // skipping n and m + + register cmph_uint32 * vals_rems = (ptr += (buflen_sel >> 2)); + + // compressed sequence query computation + register cmph_uint32 rems_mask; + register cmph_uint32 val_quot, val_rem; + register cmph_uint32 sel_res, rank; + + if(idx > max_val) + { + return n; + } + + val_quot = idx >> rem_r; + rems_mask = (1 << rem_r) - 1; + val_rem = idx & rems_mask; + if(val_quot == 0) + { + rank = sel_res = 0; + } + else + { + sel_res = select_query_packed(sel_packed, 
val_quot - 1) + 1; + rank = sel_res - val_quot; + } + + do + { + if(GETBIT32(bits_vec, sel_res)) + { + break; + } + if(get_bits_value(vals_rems, rank, rem_r, rems_mask) >= val_rem) + { + break; + } + sel_res++; + rank++; + } while(1); + + return rank; +} + + + diff --git a/src/compressed_rank.h b/src/compressed_rank.h new file mode 100644 index 0000000..bfe930d --- /dev/null +++ b/src/compressed_rank.h @@ -0,0 +1,55 @@ +#ifndef __CMPH_COMPRESSED_RANK_H__ +#define __CMPH_COMPRESSED_RANK_H__ + +#include "select.h" + +struct _compressed_rank_t +{ + cmph_uint32 max_val; + cmph_uint32 n; // number of values stored in vals_rems + // The length in bits of each value is decomposed into two components: the lg(n) MSBs are stored in rank_select data structure + // the remaining LSBs are stored in a table of n cells, each one of rem_r bits. + cmph_uint32 rem_r; + select_t sel; + cmph_uint32 * vals_rems; +}; + +typedef struct _compressed_rank_t compressed_rank_t; + +void compressed_rank_init(compressed_rank_t * cr); + +void compressed_rank_destroy(compressed_rank_t * cr); + +void compressed_rank_generate(compressed_rank_t * cr, cmph_uint32 * vals_table, cmph_uint32 n); + +cmph_uint32 compressed_rank_query(compressed_rank_t * cr, cmph_uint32 idx); + +cmph_uint32 compressed_rank_get_space_usage(compressed_rank_t * cr); + +void compressed_rank_dump(compressed_rank_t * cr, char **buf, cmph_uint32 *buflen); + +void compressed_rank_load(compressed_rank_t * cr, const char *buf, cmph_uint32 buflen); + + +/** \fn void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed); + * \brief Support the ability to pack a compressed_rank structure into a preallocated contiguous memory space pointed to by cr_packed. + * \param cr points to the compressed_rank structure + * \param cr_packed pointer to the contiguous memory area used to store the compressed_rank structure. 
The size of cr_packed must be at least @see compressed_rank_packed_size + */ +void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed); + +/** \fn cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr); + * \brief Return the amount of space needed to pack a compressed_rank structure. + * \return the size of the packed compressed_rank structure or zero for failures + */ +cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr); + + +/** \fn cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx); + * \param cr_packed is a pointer to a contiguous memory area + * \param idx is an index to compute the rank + * \return an integer that represents the compressed_rank value. + */ +cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx); + +#endif diff --git a/src/compressed_seq.c b/src/compressed_seq.c index fd5001b..84cccef 100644 --- a/src/compressed_seq.c +++ b/src/compressed_seq.c @@ -10,7 +10,7 @@ // #define DEBUG #include "debug.h" -static inline cmph_uint32 i_log2(cmph_uint32 x) +static inline cmph_uint32 compressed_seq_i_log2(cmph_uint32 x) { register cmph_uint32 res = 0; @@ -61,7 +61,7 @@ void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cm } else { - lengths[i] = i_log2(vals_table[i] + 1); + lengths[i] = compressed_seq_i_log2(vals_table[i] + 1); cs->total_length += lengths[i]; }; }; @@ -82,7 +82,12 @@ void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cm cs->total_length += lengths[i]; }; - cs->rem_r = i_log2(cs->total_length/cs->n); + cs->rem_r = compressed_seq_i_log2(cs->total_length/cs->n); + + if(cs->rem_r == 0) + { + cs->rem_r = 1; + } if(cs->length_rems) { @@ -118,7 +123,7 @@ cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs) return 4 * sizeof(cmph_uint32) * 8 + space_usage; } -cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx) +cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx) { 
register cmph_uint32 enc_idx, enc_length; register cmph_uint32 rems_mask; @@ -156,7 +161,7 @@ cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx) void compressed_seq_dump(compressed_seq_t * cs, char ** buf, cmph_uint32 * buflen) { - register cmph_uint32 sel_size = select_get_space_usage(&cs->sel) >> 3; + register cmph_uint32 sel_size = select_packed_size(&(cs->sel)); register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r) * 4; register cmph_uint32 store_table_size = ((cs->total_length + 31) >> 5) * 4; register cmph_uint32 pos = 0; @@ -325,7 +330,7 @@ cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs) } -cmph_int32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx) +cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx) { // unpacking cs_packed register cmph_uint32 *ptr = (cmph_uint32 *)cs_packed; diff --git a/src/compressed_seq.h b/src/compressed_seq.h index 8f53665..8d87fc7 100644 --- a/src/compressed_seq.h +++ b/src/compressed_seq.h @@ -38,13 +38,13 @@ void compressed_seq_destroy(compressed_seq_t * cs); void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n); -/** \fn cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx); +/** \fn cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx); * \brief Returns the value stored at index @see idx of the compressed sequence structure. 
* \param cs points to the compressed sequence structure * \param idx index to retrieve the value from * \return the value stored at index @see idx of the compressed sequence structure */ -cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx); +cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx); /** \fn cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs); @@ -73,12 +73,12 @@ void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed); cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs); -/** \fn cmph_int32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx); +/** \fn cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx); * \brief Returns the value stored at index @see idx of the packed compressed sequence structure. * \param cs_packed is a pointer to a contiguous memory area * \param idx is the index to retrieve the value from * \return the value stored at index @see idx of the packed compressed sequence structure */ -cmph_int32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx); +cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx); #endif diff --git a/src/select.c b/src/select.c index fdf127c..3496b77 100644 --- a/src/select.c +++ b/src/select.c @@ -164,7 +164,7 @@ void select_generate(select_t * sel, cmph_uint32 * keys_vec, cmph_uint32 n, cmph select_generate_sel_table(sel); }; -static inline cmph_int32 _select_query(cmph_uint8 * bits_table, cmph_uint32 * select_table, cmph_uint32 one_idx) +static inline cmph_uint32 _select_query(cmph_uint8 * bits_table, cmph_uint32 * select_table, cmph_uint32 one_idx) { register cmph_uint32 vec_bit_idx ,vec_byte_idx; register cmph_uint32 part_sum, old_part_sum; @@ -187,13 +187,13 @@ static inline cmph_int32 _select_query(cmph_uint8 * bits_table, cmph_uint32 * se return select_lookup_table[bits_table[vec_byte_idx - 1]][one_idx - old_part_sum] + ((vec_byte_idx-1) << 3); } -cmph_int32 
select_query(select_t * sel, cmph_uint32 one_idx) +cmph_uint32 select_query(select_t * sel, cmph_uint32 one_idx) { return _select_query((cmph_uint8 *)sel->bits_vec, sel->select_table, one_idx); }; -static inline cmph_int32 _select_next_query(cmph_uint8 * bits_table, cmph_uint32 vec_bit_idx) +static inline cmph_uint32 _select_next_query(cmph_uint8 * bits_table, cmph_uint32 vec_bit_idx) { register cmph_uint32 vec_byte_idx, one_idx; register cmph_uint32 part_sum, old_part_sum; @@ -214,7 +214,7 @@ static inline cmph_int32 _select_next_query(cmph_uint8 * bits_table, cmph_uint32 return select_lookup_table[bits_table[(vec_byte_idx - 1)]][(one_idx - old_part_sum)] + ((vec_byte_idx - 1) << 3); } -cmph_int32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx) +cmph_uint32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx) { return _select_next_query((cmph_uint8 *)sel->bits_vec, vec_bit_idx); }; @@ -315,12 +315,7 @@ cmph_uint32 select_packed_size(select_t *sel) -/** \fn cmph_int32 select_query_packed(void * sel_packed, cmph_uint32 idx); - * \param sel_packed is a pointer to a contiguous memory area - * \param idx is the rank for which we want to calculate the inverse function select - * \return an integer that represents the select value of rank idx. - */ -cmph_int32 select_query_packed(void * sel_packed, cmph_uint32 one_idx) +cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx) { register cmph_uint32 *ptr = (cmph_uint32 *)sel_packed; register cmph_uint32 n = *ptr++; @@ -334,12 +329,7 @@ cmph_int32 select_query_packed(void * sel_packed, cmph_uint32 one_idx) } -/** \fn cmph_int32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx); - * \param sel_packed is a pointer to a contiguous memory area - * \param vec_bit_idx is a value prior computed by @see select_query_packed - * \return an integer that represents the next select value greater than @see vec_bit_idx. 
- */ -cmph_int32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx) +cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx) { register cmph_uint8 * bits_vec = (cmph_uint8 *)sel_packed; bits_vec += 8; // skipping n and m diff --git a/src/select.h b/src/select.h index a193ac2..a31eb0f 100644 --- a/src/select.h +++ b/src/select.h @@ -18,9 +18,9 @@ void select_destroy(select_t * sel); void select_generate(select_t * sel, cmph_uint32 * keys_vec, cmph_uint32 n, cmph_uint32 m); -cmph_int32 select_query(select_t * sel, cmph_uint32 one_idx); +cmph_uint32 select_query(select_t * sel, cmph_uint32 one_idx); -cmph_int32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx); +cmph_uint32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx); cmph_uint32 select_get_space_usage(select_t * sel); @@ -43,19 +43,19 @@ void select_pack(select_t *sel, void *sel_packed); cmph_uint32 select_packed_size(select_t *sel); -/** \fn cmph_int32 select_query_packed(void * sel_packed, cmph_uint32 idx); +/** \fn cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx); * \param sel_packed is a pointer to a contiguous memory area * \param one_idx is the rank for which we want to calculate the inverse function select * \return an integer that represents the select value of rank idx. */ -cmph_int32 select_query_packed(void * sel_packed, cmph_uint32 one_idx); +cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx); -/** \fn cmph_int32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx); +/** \fn cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx); * \param sel_packed is a pointer to a contiguous memory area * \param vec_bit_idx is a value prior computed by @see select_query_packed * \return an integer that represents the next select value greater than @see vec_bit_idx. 
*/ -cmph_int32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx); +cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx); #endif diff --git a/tests/Makefile.am b/tests/Makefile.am index 54f0b75..a0fe694 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,4 +1,4 @@ -noinst_PROGRAMS = graph_tests packed_mphf_tests mphf_tests select_tests compressed_seq_tests +noinst_PROGRAMS = graph_tests packed_mphf_tests mphf_tests select_tests compressed_seq_tests compressed_rank_tests INCLUDES = -I../src/ @@ -16,3 +16,6 @@ select_tests_LDADD = ../src/libcmph.la compressed_seq_tests_SOURCES = compressed_seq_tests.c compressed_seq_tests_LDADD = ../src/libcmph.la + +compressed_rank_tests_SOURCES = compressed_rank_tests.c +compressed_rank_tests_LDADD = ../src/libcmph.la diff --git a/tests/compressed_seq_tests.c b/tests/compressed_seq_tests.c index 3ebb687..f81f794 100644 --- a/tests/compressed_seq_tests.c +++ b/tests/compressed_seq_tests.c @@ -27,7 +27,7 @@ int main(int argc, char **argv) cmph_uint32 i = 0; cmph_uint32 n = 20; cmph_uint32 keys_vec[] = { 0, 1, 1, 1, 2, 2, 2, 3, 5, 5, - 6, 6, 9, 9, 9, 12, 12, 13, 17, 10017}; + 6, 6, 9, 9, 9, 12, 12, 13, 17, 1077}; char *buf = NULL; cmph_uint32 buflen = 0; char * cs_packed = NULL; diff --git a/tests/packed_mphf_tests.c b/tests/packed_mphf_tests.c index f800b97..51336b9 100644 --- a/tests/packed_mphf_tests.c +++ b/tests/packed_mphf_tests.c @@ -154,6 +154,12 @@ int main(int argc, char **argv) // testing the packed function //check all keys + #ifdef CMPH_TIMING + double evaluation_time_begin = 0.0; + double evaluation_time = 0.0; + ELAPSED_TIME_IN_SECONDS(&evaluation_time_begin); + #endif + for (i = 0; i < source->nkeys; ++i) { cmph_uint32 h; @@ -179,6 +185,12 @@ int main(int argc, char **argv) } source->dispose(source->data, buf, buflen); } + #ifdef CMPH_TIMING + ELAPSED_TIME_IN_SECONDS(&evaluation_time); + evaluation_time = evaluation_time - evaluation_time_begin; + fprintf(stdout, 
"%u\t%.2f\n", source->nkeys, evaluation_time); + #endif + free(packed_mphf); cmph_destroy(mphf); free(hashtable);