From 9f999ef428b35149ceb7233ad423473a3c204cb3 Mon Sep 17 00:00:00 2001 From: "Fabiano C. Botelho" Date: Sat, 20 Apr 2013 01:28:00 -0700 Subject: [PATCH] Remaining part of the fix for bug 3465649. This one fixes both BRZ and CHD_PH for small key sets. --- examples/Makefile.am | 6 +-- .../{small_set_test.c => small_set_ex4.c} | 33 +++++++++++- src/bmz.c | 6 +++ src/bmz8.c | 6 +++ src/brz.c | 52 +++++++++++++++---- src/brz.h | 15 ++++++ src/chd_ph.c | 11 +++- 7 files changed, 112 insertions(+), 17 deletions(-) rename examples/{small_set_test.c => small_set_ex4.c} (68%) diff --git a/examples/Makefile.am b/examples/Makefile.am index 5439bf5..69593d8 100755 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -1,4 +1,4 @@ -noinst_PROGRAMS = vector_adapter_ex1 file_adapter_ex2 struct_vector_adapter_ex3 small_set_test +noinst_PROGRAMS = vector_adapter_ex1 file_adapter_ex2 struct_vector_adapter_ex3 small_set_ex4 INCLUDES = -I../src/ @@ -11,5 +11,5 @@ file_adapter_ex2_SOURCES = file_adapter_ex2.c struct_vector_adapter_ex3_LDADD = ../src/libcmph.la struct_vector_adapter_ex3_SOURCES = struct_vector_adapter_ex3.c -small_set_test_LDADD = ../src/libcmph.la -small_set_test_SOURCES = small_set_test.c +small_set_ex4_LDADD = ../src/libcmph.la +small_set_ex4_SOURCES = small_set_ex4.c diff --git a/examples/small_set_test.c b/examples/small_set_ex4.c similarity index 68% rename from examples/small_set_test.c rename to examples/small_set_ex4.c index 31fb537..dc77a05 100644 --- a/examples/small_set_test.c +++ b/examples/small_set_ex4.c @@ -6,6 +6,8 @@ int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n) cmph_config_t *config; cmph_io_adapter_t *source; cmph_uint32 i; + char filename[256]; + FILE* mphf_fd = NULL; printf("%s (%u)\n", cmph_names[alg_n], alg_n); @@ -16,9 +18,21 @@ int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n) items_len); config = cmph_config_new(source); cmph_config_set_algo(config, alg_n); + if (alg_n == CMPH_BRZ) { 
+ sprintf(filename, "%s_%u.mph", cmph_names[alg_n], items_len); + mphf_fd = fopen(filename, "w"); + cmph_config_set_mphf_fd(config, mphf_fd); + } hash = cmph_new(config); cmph_config_destroy(config); + if (alg_n == CMPH_BRZ) { + cmph_dump(hash, mphf_fd); + cmph_destroy(hash); + fclose(mphf_fd); + mphf_fd = fopen(filename, "r"); + hash = cmph_load(mphf_fd); + } printf("packed_size %u\n",cmph_packed_size(hash)); for (i=0; im = mph->key_source->nkeys; bmz->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); + + if (bmz->n < 5) // workaround for small key sets + { + bmz->n = 5; + } + DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz->m, bmz->n, c); bmz->graph = graph_new(bmz->n, bmz->m); DEBUGP("Created graph\n"); diff --git a/src/bmz8.c b/src/bmz8.c index dc981df..894463d 100644 --- a/src/bmz8.c +++ b/src/bmz8.c @@ -74,6 +74,12 @@ cmph_t *bmz8_new(cmph_config_t *mph, double c) DEBUGP("c: %f\n", c); bmz8->m = (cmph_uint8) mph->key_source->nkeys; bmz8->n = (cmph_uint8) ceil(c * mph->key_source->nkeys); + + if (bmz8->n < 5) // workaround for small key sets + { + bmz8->n = 5; + } + DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz8->m, bmz8->n, c); bmz8->graph = graph_new(bmz8->n, bmz8->m); DEBUGP("Created graph\n"); diff --git a/src/brz.c b/src/brz.c index 885db9d..1a7a729 100755 --- a/src/brz.c +++ b/src/brz.c @@ -27,9 +27,9 @@ static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * b brz_config_data_t *brz_config_new(void) { brz_config_data_t *brz = NULL; - brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); - if (!brz) return NULL; - brz->algo = CMPH_FCH; + brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); + if (!brz) return NULL; + brz->algo = CMPH_FCH; brz->b = 128; brz->hashfuncs[0] = CMPH_HASH_JENKINS; brz->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -131,6 +131,15 @@ cmph_t *brz_new(cmph_config_t *mph, double c) DEBUGP("c: %f\n", c); brz_config_data_t *brz = (brz_config_data_t *)mph->data; + + // Since we keep dumping 
partial pieces of the MPHF as it gets created + // the caller must set the file to store the resulting MPHF before calling + // this function. + if (brz->mphf_fd == NULL) + { + return NULL; + } + switch(brz->algo) // validating restrictions over parameter c. { case CMPH_BMZ8: @@ -144,6 +153,11 @@ cmph_t *brz_new(cmph_config_t *mph, double c) } brz->c = c; brz->m = mph->key_source->nkeys; + if (brz->m < 5) + { + brz->c = 5; + } + DEBUGP("m: %u\n", brz->m); brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b)); DEBUGP("k: %u\n", brz->k); @@ -364,7 +378,7 @@ static int brz_gen_mphf(cmph_config_t *mph) { fprintf(stderr, "\nMPHF generation \n"); } - /* Starting to dump to disk the resultant MPHF: __cmph_dump function */ + /* Starting to dump to disk the resulting MPHF: __cmph_dump function */ nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd); @@ -442,7 +456,7 @@ static int brz_gen_mphf(cmph_config_t *mph) source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd); config = cmph_config_new(source); cmph_config_set_algo(config, brz->algo); - //cmph_config_set_algo(config, CMPH_BMZ8); + cmph_config_set_hashfuncs(config, brz->hashfuncs); cmph_config_set_graphsize(config, brz->c); mphf_tmp = cmph_new(config); if (mphf_tmp == NULL) @@ -565,7 +579,7 @@ int brz_dump(cmph_t *mphf, FILE *fd) cmph_uint32 buflen; register size_t nbytes; DEBUGP("Dumping brzf\n"); - // The initial part of the MPHF have already been dumped to disk during construction + // The initial part of the MPHF has already been dumped to disk during construction // Dumping h0 hash_state_dump(data->h0, &buf, &buflen); DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); @@ -730,7 +744,13 @@ void brz_pack(cmph_t *mphf, void *packed_mphf) brz_data_t *data = (brz_data_t *)mphf->data; cmph_uint8 * ptr = 
(cmph_uint8 *)packed_mphf; cmph_uint32 i,n; - + + // This assumes that if one function pointer is NULL, + // all the others will be as well. + if (data->h1 == NULL) + { + return; + } // packing internal algo type memcpy(ptr, &(data->algo), sizeof(data->algo)); ptr += sizeof(data->algo); @@ -821,9 +841,21 @@ cmph_uint32 brz_packed_size(cmph_t *mphf) cmph_uint32 i; cmph_uint32 size = 0; brz_data_t *data = (brz_data_t *)mphf->data; - CMPH_HASH h0_type = hash_get_type(data->h0); - CMPH_HASH h1_type = hash_get_type(data->h1[0]); - CMPH_HASH h2_type = hash_get_type(data->h2[0]); + CMPH_HASH h0_type; + CMPH_HASH h1_type; + CMPH_HASH h2_type; + + // This assumes that if one function pointer is NULL, + // all the others will be as well. + if (data->h1 == NULL) + { + return 0U; + } + + h0_type = hash_get_type(data->h0); + h1_type = hash_get_type(data->h1[0]); + h2_type = hash_get_type(data->h2[0]); + size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) + sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k); // pointers to g_is diff --git a/src/brz.h b/src/brz.h index 648f174..df21d77 100644 --- a/src/brz.h +++ b/src/brz.h @@ -3,6 +3,21 @@ #include "cmph.h" +/* + * The BRZ algorithm has been built so to consume the bare minimum + * amount of memory to generate the MPHFs. Thereby we decided + * to dump the resulting MPHFs to disk while creating them. Thus, + * to use the BRZ algorithm, one has to call brz_config_set_mphf_fd + * before calling brz_new. Otherwise we will fail the MPHF creation. 
+ * One side effect of this design decision is that the resulting + * MPHF cannot be used until its dumping process is finalized by + * calling brz_dump; the caller must then call brz_load before + * calling any of the following functions: + * brz_search + * brz_pack + * brz_packed_size + * brz_search_packed + */ typedef struct __brz_data_t brz_data_t; typedef struct __brz_config_data_t brz_config_data_t; diff --git a/src/chd_ph.c b/src/chd_ph.c index 43b936f..fbcd517 100644 --- a/src/chd_ph.c +++ b/src/chd_ph.c @@ -627,7 +627,8 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) register double load_factor = c; register cmph_uint8 searching_success = 0; - register cmph_uint32 max_probes = 1 << 20; // default value for max_probes + register cmph_uint32 max_probes_default = 1 << 20; // default value for max_probes + register cmph_uint32 max_probes; register cmph_uint32 iterations = 100; chd_ph_bucket_t * buckets = NULL; chd_ph_item_t * items = NULL; @@ -688,7 +689,13 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) buckets = chd_ph_bucket_new(chd_ph->nbuckets); items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t)); - max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes); + max_probes = (cmph_uint32)((log(chd_ph->m)/log(2))/20); + + if (max_probes == 0) { + max_probes = max_probes_default; + } else { + max_probes = max_probes * max_probes_default; + } if(chd_ph->keys_per_bin == 1) chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32));