diff --git a/src/bmz.c b/src/bmz.c index e4d1c31..7371c0e 100644 --- a/src/bmz.c +++ b/src/bmz.c @@ -66,6 +66,7 @@ cmph_t *bmz_new(cmph_config_t *mph, float c) cmph_uint8 * visited = NULL; bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data; + if (c == 0) c = 1.15; // validating restrictions over parameter c. DEBUGP("c: %f\n", c); bmz->m = mph->key_source->nkeys; bmz->n = ceil(c * mph->key_source->nkeys); diff --git a/src/bmz8.c b/src/bmz8.c index f0690dc..032a3bc 100644 --- a/src/bmz8.c +++ b/src/bmz8.c @@ -70,7 +70,7 @@ cmph_t *bmz8_new(cmph_config_t *mph, float c) if (mph->verbosity) fprintf(stderr, "The number of keys in BMZ8 must be lower than 256.\n"); return NULL; } - + if (c == 0) c = 1.15; // validating restrictions over parameter c. DEBUGP("c: %f\n", c); bmz8->m = mph->key_source->nkeys; bmz8->n = ceil(c * mph->key_source->nkeys); diff --git a/src/brz.c b/src/brz.c index dad31d1..83df9e1 100755 --- a/src/brz.c +++ b/src/brz.c @@ -1,4 +1,6 @@ #include "graph.h" +#include "fch.h" +#include "fch_structs.h" #include "bmz8.h" #include "bmz8_structs.h" #include "brz.h" @@ -20,11 +22,13 @@ static int brz_gen_mphf(cmph_config_t *mph); static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n); static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint8 nkeys); -static char * brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen); +static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen); +static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen); brz_config_data_t *brz_config_new() { brz_config_data_t *brz = NULL; brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); + brz->algo = CMPH_BMZ8; brz->b = 128; brz->hashfuncs[0] = CMPH_HASH_JENKINS; brz->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -111,8 +115,17 @@ cmph_t *brz_new(cmph_config_t *mph, float c) cmph_uint32 i; cmph_uint32 iterations = 20; - DEBUGP("c: %f\n"); + DEBUGP("c: %f\n", c); brz_config_data_t *brz = (brz_config_data_t *)mph->data; + switch(brz->algo) // validating restrictions over parameter c. + { + case CMPH_BMZ8: + if (c == 0 || c >= 2.0) c = 1; + break; + case CMPH_FCH: + if (c <= 2.0) c = 2.6; + break; + } brz->c = c; brz->m = mph->key_source->nkeys; DEBUGP("m: %u\n", brz->m); @@ -179,8 +192,9 @@ cmph_t *brz_new(cmph_config_t *mph, float c) brzf->k = brz->k; brzf->c = brz->c; brzf->m = brz->m; + brzf->algo = brz->algo; mphf->data = brzf; - mphf->size = brz->m; + mphf->size = brz->m; DEBUGP("Successfully generated minimal perfect hash\n"); if (mph->verbosity) { @@ -191,7 +205,7 @@ cmph_t *brz_new(cmph_config_t *mph, float c) static int brz_gen_mphf(cmph_config_t *mph) { - cmph_uint32 i, e; + cmph_uint32 i, e, error; brz_config_data_t *brz = (brz_config_data_t *)mph->data; cmph_uint32 memory_usage = 0; cmph_uint32 nkeys_in_buffer = 0; @@ -269,7 +283,7 @@ static int brz_gen_mphf(cmph_config_t *mph) memory_usage += keylen + sizeof(keylen); h0 = hash(brz->h0, key, keylen) % brz->k; - if ((brz->size[h0] == MAX_BUCKET_SIZE) || ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0])) + if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0]))) { free(buffer); free(buckets_size); @@ -337,6 +351,7 @@ static int brz_gen_mphf(cmph_config_t *mph) fwrite(cmph_names[CMPH_BRZ], (cmph_uint32)(strlen(cmph_names[CMPH_BRZ]) + 1), 1, brz->mphf_fd); fwrite(&(brz->m), sizeof(brz->m), 1, brz->mphf_fd); fwrite(&(brz->c), sizeof(cmph_float32), 1, brz->mphf_fd); + fwrite(&(brz->algo), sizeof(brz->algo), 1, brz->mphf_fd); fwrite(&(brz->k), sizeof(cmph_uint32), 1, brz->mphf_fd); // number of MPHFs fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), 1, brz->mphf_fd); @@ -362,6 +377,7 @@ static int brz_gen_mphf(cmph_config_t *mph) e = 0; keys_vd = (cmph_uint8 **)calloc(MAX_BUCKET_SIZE, sizeof(cmph_uint8 *)); nkeys_vd = 0; + error = 0; while(e < brz->m) { i = brz_min_index(buffer_h0, nflushes); @@ -403,18 +419,49 @@ static int brz_gen_mphf(cmph_config_t *mph) cmph_io_adapter_t *source = NULL; cmph_config_t *config = NULL; cmph_t *mphf_tmp = NULL; - bmz8_data_t * bmzf = NULL; char *bufmphf = NULL; cmph_uint32 buflenmphf = 0; // Source of keys source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd); config = cmph_config_new(source); - cmph_config_set_algo(config, CMPH_BMZ8); + cmph_config_set_algo(config, brz->algo); + //cmph_config_set_algo(config, CMPH_BMZ8); cmph_config_set_graphsize(config, brz->c); mphf_tmp = cmph_new(config); - bmzf = (bmz8_data_t *)mphf_tmp->data; - bufmphf = brz_copy_partial_mphf(brz, bmzf, cur_bucket, &buflenmphf); - bmzf = NULL; + if (mphf_tmp == NULL) + { + if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k); + error = 1; + cmph_config_destroy(config); + brz_destroy_keys_vd(keys_vd, nkeys_vd); + cmph_io_byte_vector_adapter_destroy(source); + break; + } + if(mph->verbosity) + { + if (cur_bucket % 1000 == 0) + { + fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k); + } + } + switch(brz->algo) + { + case CMPH_FCH: + { + fch_data_t * fchf = NULL; + fchf = (fch_data_t *)mphf_tmp->data; + bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf); + } + break; + case CMPH_BMZ8: + { + bmz8_data_t * bmzf = NULL; + bmzf = (bmz8_data_t *)mphf_tmp->data; + bufmphf = brz_copy_partial_bmz8_mphf(brz, bmzf, cur_bucket, &buflenmphf); + } + break; + default: assert(0); + } fwrite(bufmphf, buflenmphf, 1, brz->mphf_fd); free(bufmphf); bufmphf = NULL; @@ -425,11 +472,11 @@ static int brz_gen_mphf(cmph_config_t *mph) nkeys_vd = 0; } } - buffer_manager_destroy(buff_manager); free(keys_vd); free(buffer_merge); free(buffer_h0); + if (error) return 0; return 1; } @@ -449,7 +496,29 @@ static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint8 nkeys) for(i = 0; i < nkeys; i++) { free(keys_vd[i]); keys_vd[i] = NULL;} } -static char * brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen) +static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen) +{ + cmph_uint32 i = 0; + cmph_uint32 buflenh1 = 0; + cmph_uint32 buflenh2 = 0; + char * bufh1 = NULL; + char * bufh2 = NULL; + char * buf = NULL; + cmph_uint32 n = fchf->b;//brz->size[index]; + hash_state_dump(fchf->h1, &bufh1, &buflenh1); + hash_state_dump(fchf->h2, &bufh2, &buflenh2); + *buflen = buflenh1 + buflenh2 + n + 2*sizeof(cmph_uint32); + buf = (char *)malloc(*buflen); + memcpy(buf, &buflenh1, sizeof(cmph_uint32)); + memcpy(buf+sizeof(cmph_uint32), bufh1, buflenh1); + memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32)); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, buflenh2); + for (i = 0; i < n; i++) memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2+i,(fchf->g + i), 1); + free(bufh1); + free(bufh2); + return buf; +} +static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen) { cmph_uint32 buflenh1 = 0; cmph_uint32 buflenh2 = 0; @@ -470,6 +539,8 @@ static char * brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, free(bufh2); return buf; } + + int brz_dump(cmph_t *mphf, FILE *fd) { brz_data_t *data = (brz_data_t *)mphf->data; @@ -489,9 +560,6 @@ int brz_dump(cmph_t *mphf, FILE *fd) return 1; } - - - void brz_load(FILE *f, cmph_t *mphf) { char *buf = NULL; @@ -502,31 +570,41 @@ void brz_load(FILE *f, cmph_t *mphf) DEBUGP("Loading brz mphf\n"); mphf->data = brz; fread(&(brz->c), sizeof(cmph_float32), 1, f); + fread(&(brz->algo), sizeof(brz->algo), 1, f); // Reading algo. fread(&(brz->k), sizeof(cmph_uint32), 1, f); brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k); fread(brz->size, sizeof(cmph_uint8)*(brz->k), 1, f); brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); brz->g = (cmph_uint8 **) calloc(brz->k, sizeof(cmph_uint8 *)); - DEBUGP("Reading %u h1 and %u h2\n", brz->k, brz->k); + DEBUGP("Reading c = %f k = %u algo = %u \n", brz->c, brz->k, brz->algo); //loading h_i1, h_i2 and g_i. for(i = 0; i < brz->k; i++) { // h1 fread(&buflen, sizeof(cmph_uint32), 1, f); - DEBUGP("Hash state has %u bytes\n", buflen); + DEBUGP("Hash state 1 has %u bytes\n", buflen); buf = (char *)malloc(buflen); fread(buf, buflen, 1, f); brz->h1[i] = hash_state_load(buf, buflen); free(buf); //h2 fread(&buflen, sizeof(cmph_uint32), 1, f); - DEBUGP("Hash state has %u bytes\n", buflen); + DEBUGP("Hash state 2 has %u bytes\n", buflen); buf = (char *)malloc(buflen); fread(buf, buflen, 1, f); brz->h2[i] = hash_state_load(buf, buflen); free(buf); - n = ceil(brz->c * brz->size[i]); + switch(brz->algo) + { + case CMPH_FCH: + n = fch_calc_b(brz->c, brz->size[i]); + break; + case CMPH_BMZ8: + n = ceil(brz->c * brz->size[i]); + break; + default: assert(0); + } DEBUGP("g_i has %u bytes\n", n); brz->g[i] = (cmph_uint8 *)calloc(n, sizeof(cmph_uint8)); fread(brz->g[i], sizeof(cmph_uint8)*n, 1, f); @@ -542,13 +620,12 @@ void brz_load(FILE *f, cmph_t *mphf) //loading c, m, and the vector offset. fread(&(brz->m), sizeof(cmph_uint32), 1, f); brz->offset = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*brz->k); - fread(brz->offset, sizeof(cmph_uint32)*(brz->k), 1, f); + fread(brz->offset, sizeof(cmph_uint32)*(brz->k), 1, f); return; } -cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +static cmph_uint32 brz_bmz8_search(brz_data_t *brz, const char *key, cmph_uint32 keylen) { - brz_data_t *brz = mphf->data; cmph_uint32 h0 = hash(brz->h0, key, keylen) % brz->k; cmph_uint32 m = brz->size[h0]; cmph_uint32 n = ceil(brz->c * m); @@ -562,6 +639,35 @@ cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) DEBUGP("Address: %u\n", mphf_bucket + brz->offset[h0]); return (mphf_bucket + brz->offset[h0]); } + +static cmph_uint32 brz_fch_search(brz_data_t *brz, const char *key, cmph_uint32 keylen) +{ + cmph_uint32 h0 = hash(brz->h0, key, keylen) % brz->k; + cmph_uint32 m = brz->size[h0]; + cmph_uint32 b = fch_calc_b(brz->c, m); + cmph_float32 p1 = fch_calc_p1(m); + cmph_float32 p2 = fch_calc_p2(b); + cmph_uint32 h1 = hash(brz->h1[h0], key, keylen) % m; + cmph_uint32 h2 = hash(brz->h2[h0], key, keylen) % m; + cmph_uint8 mphf_bucket = 0; + h1 = mixh10h11h12(b, p1, p2, h1); + mphf_bucket = (h2 + brz->g[h0][h1]) % m; + return (mphf_bucket + brz->offset[h0]); +} + +cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + brz_data_t *brz = mphf->data; + switch(brz->algo) + { + case CMPH_FCH: + return brz_fch_search(brz, key, keylen); + case CMPH_BMZ8: + return brz_bmz8_search(brz, key, keylen); + default: assert(0); + } + return 0; +} void brz_destroy(cmph_t *mphf) { cmph_uint32 i; diff --git a/src/brz_structs.h b/src/brz_structs.h index 537dc12..fa28d3f 100755 --- a/src/brz_structs.h +++ b/src/brz_structs.h @@ -5,6 +5,7 @@ struct __brz_data_t { + CMPH_ALGO algo; // CMPH algo for generating the MPHFs for the buckets (Just CMPH_FCH and CMPH_BMZ8) cmph_uint32 m; // edges (words) count cmph_float32 c; // constant c cmph_uint8 *size; // size[i] stores the number of edges represented by g[i][...]. @@ -19,6 +20,7 @@ struct __brz_data_t struct __brz_config_data_t { CMPH_HASH hashfuncs[3]; + CMPH_ALGO algo; // CMPH algo for generating the MPHFs for the buckets (Just CMPH_FCH and CMPH_BMZ8) cmph_float32 c; // constant c cmph_uint32 m; // edges (words) count cmph_uint8 *size; // size[i] stores the number of edges represented by g[i][...]. diff --git a/src/chm.c b/src/chm.c index 24bcbc8..6caedb5 100644 --- a/src/chm.c +++ b/src/chm.c @@ -59,7 +59,8 @@ cmph_t *chm_new(cmph_config_t *mph, float c) cmph_uint32 iterations = 20; cmph_uint8 *visited = NULL; chm_config_data_t *chm = (chm_config_data_t *)mph->data; - chm->m = mph->key_source->nkeys; + chm->m = mph->key_source->nkeys; + if (c == 0) c = 2.09; chm->n = ceil(c * mph->key_source->nkeys); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", chm->m, chm->n, c); chm->graph = graph_new(chm->n, chm->m); diff --git a/src/cmph.c b/src/cmph.c index 03d00ed..1d7badd 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -354,27 +354,22 @@ cmph_t *cmph_new(cmph_config_t *mph) { case CMPH_CHM: DEBUGP("Creating chm hash\n"); - if (c == 0) c = 2.09; mphf = chm_new(mph, c); break; case CMPH_BMZ: /* included -- Fabiano */ DEBUGP("Creating bmz hash\n"); - if (c == 0) c = 1.15; mphf = bmz_new(mph, c); break; case CMPH_BMZ8: /* included -- Fabiano */ DEBUGP("Creating bmz8 hash\n"); - if (c == 0) c = 1.15; mphf = bmz8_new(mph, c); break; case CMPH_BRZ: /* included -- Fabiano */ DEBUGP("Creating brz hash\n"); - if (c == 0) c = 1.15; mphf = brz_new(mph, c); break; case CMPH_FCH: /* included -- Fabiano */ DEBUGP("Creating fch hash\n"); - if (c <= 2) c = 2.6; mphf = fch_new(mph, c); break; default: diff --git a/src/fch.c b/src/fch.c index aa9984b..ba0357f 100644 --- a/src/fch.c +++ b/src/fch.c @@ -13,8 +13,6 @@ //#define DEBUG #include "debug.h" -static cmph_uint32 mixh10h11h12(cmph_uint32 b, cmph_float32 p1, cmph_float32 p2, cmph_uint32 initial_index); -static void calc_parameters(fch_config_data_t *fch); static fch_buckets_t * mapping(cmph_config_t *mph); static cmph_uint32 * ordering(fch_buckets_t * buckets); static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t * buckets, cmph_uint32 *sorted_indexes); @@ -57,7 +55,7 @@ void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) } } -static cmph_uint32 mixh10h11h12(cmph_uint32 b, cmph_float32 p1, cmph_float32 p2, cmph_uint32 initial_index) +cmph_uint32 mixh10h11h12(cmph_uint32 b, cmph_float32 p1, cmph_float32 p2, cmph_uint32 initial_index) { if (initial_index < p1) initial_index %= (cmph_uint32)p2; /* h11 o h10 */ else { /* h12 o h10 */ @@ -67,11 +65,20 @@ static cmph_uint32 mixh10h11h12(cmph_uint32 b, cmph_float32 p1, cmph_float32 p2, return initial_index; } -static void calc_parameters(fch_config_data_t *fch) + +cmph_uint32 fch_calc_b(cmph_float32 c, cmph_uint32 m) { - fch->b = (cmph_uint32)ceil((fch->c*fch->m)/(log(fch->m)/log(2) + 1)); - fch->p1 = ceil(0.55*fch->m); - fch->p2 = ceil(0.3*fch->b); + return (cmph_uint32)ceil((c*m)/(log(m)/log(2) + 1)); +} + +cmph_float32 fch_calc_p1(cmph_uint32 m) +{ + return ceil(0.55*m); +} + +cmph_float32 fch_calc_p2(cmph_uint32 b) +{ + return ceil(0.3*b); } static fch_buckets_t * mapping(cmph_config_t *mph) @@ -81,7 +88,9 @@ static fch_buckets_t * mapping(cmph_config_t *mph) fch_config_data_t *fch = (fch_config_data_t *)mph->data; if (fch->h1) hash_state_destroy(fch->h1); fch->h1 = hash_state_new(fch->hashfuncs[0], fch->m); - calc_parameters (fch); + fch->b = fch_calc_b(fch->c, fch->m); + fch->p1 = fch_calc_p1(fch->m); + fch->p2 = fch_calc_p2(fch->b); //DEBUGP("b:%u p1:%f p2:%f\n", fch->b, fch->p1, fch->p2); buckets = fch_buckets_new(fch->b); @@ -247,6 +256,7 @@ cmph_t *fch_new(cmph_config_t *mph, float c) fch_config_data_t *fch = (fch_config_data_t *)mph->data; fch->m = mph->key_source->nkeys; //DEBUGP("m: %f\n", fch->m); + if (c <= 2) c = 2.6; // validating restrictions over parameter c. fch->c = c; //DEBUGP("c: %f\n", fch->c); fch->h1 = NULL; @@ -389,7 +399,6 @@ cmph_uint32 fch_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) fch_data_t *fch = mphf->data; cmph_uint32 h1 = hash(fch->h1, key, keylen) % fch->m; cmph_uint32 h2 = hash(fch->h2, key, keylen) % fch->m; - h1 = hash(fch->h1, key, keylen) % fch->m; h1 = mixh10h11h12 (fch->b, fch->p1, fch->p2, h1); //DEBUGP("key: %s h1: %u h2: %u g[h1]: %u\n", key, h1, h2, fch->g[h1]); return (h2 + fch->g[h1]) % fch->m; diff --git a/src/fch.h b/src/fch.h index e56855c..5ce2811 100644 --- a/src/fch.h +++ b/src/fch.h @@ -6,6 +6,12 @@ typedef struct __fch_data_t fch_data_t; typedef struct __fch_config_data_t fch_config_data_t; +/* Parameters calculation */ +cmph_uint32 fch_calc_b(cmph_float32 c, cmph_uint32 m); +cmph_float32 fch_calc_p1(cmph_uint32 m); +cmph_float32 fch_calc_p2(cmph_uint32 b); +cmph_uint32 mixh10h11h12(cmph_uint32 b, cmph_float32 p1, cmph_float32 p2, cmph_uint32 initial_index); + fch_config_data_t *fch_config_new(); void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); void fch_config_destroy(cmph_config_t *mph); diff --git a/src/main.c b/src/main.c index 11459dc..ee3c954 100644 --- a/src/main.c +++ b/src/main.c @@ -217,7 +217,6 @@ int main(int argc, char **argv) if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd); else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys); - if (generate) { //Create mphf @@ -230,7 +229,8 @@ int main(int argc, char **argv) cmph_config_set_mphf_fd(config, mphf_fd); cmph_config_set_memory_availability(config, memory_availability); cmph_config_set_b(config, b); - if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15; + //if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15; + if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15; if (c != 0) cmph_config_set_graphsize(config, c); mphf = cmph_new(config); cmph_config_destroy(config);