BRZ now works with either FCH or BMZ8 as the per-bucket algorithm. BMZ8 is faster, but the MPHFs it generates for each bucket are larger.

This commit is contained in:
fc_botelho 2006-08-07 14:44:24 +00:00
parent 99f0705fed
commit 5334c9debc
9 changed files with 160 additions and 40 deletions

View File

@ -66,6 +66,7 @@ cmph_t *bmz_new(cmph_config_t *mph, float c)
cmph_uint8 * visited = NULL;
bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data;
if (c == 0) c = 1.15; // validating restrictions over parameter c.
DEBUGP("c: %f\n", c);
bmz->m = mph->key_source->nkeys;
bmz->n = ceil(c * mph->key_source->nkeys);

View File

@ -70,7 +70,7 @@ cmph_t *bmz8_new(cmph_config_t *mph, float c)
if (mph->verbosity) fprintf(stderr, "The number of keys in BMZ8 must be lower than 256.\n");
return NULL;
}
if (c == 0) c = 1.15; // validating restrictions over parameter c.
DEBUGP("c: %f\n", c);
bmz8->m = mph->key_source->nkeys;
bmz8->n = ceil(c * mph->key_source->nkeys);

146
src/brz.c
View File

@ -1,4 +1,6 @@
#include "graph.h"
#include "fch.h"
#include "fch_structs.h"
#include "bmz8.h"
#include "bmz8_structs.h"
#include "brz.h"
@ -20,11 +22,13 @@
static int brz_gen_mphf(cmph_config_t *mph);
static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n);
static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint8 nkeys);
static char * brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen);
static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen);
static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen);
brz_config_data_t *brz_config_new()
{
brz_config_data_t *brz = NULL;
brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t));
brz->algo = CMPH_BMZ8;
brz->b = 128;
brz->hashfuncs[0] = CMPH_HASH_JENKINS;
brz->hashfuncs[1] = CMPH_HASH_JENKINS;
@ -111,8 +115,17 @@ cmph_t *brz_new(cmph_config_t *mph, float c)
cmph_uint32 i;
cmph_uint32 iterations = 20;
DEBUGP("c: %f\n");
DEBUGP("c: %f\n", c);
brz_config_data_t *brz = (brz_config_data_t *)mph->data;
switch(brz->algo) // validating restrictions over parameter c.
{
case CMPH_BMZ8:
if (c == 0 || c >= 2.0) c = 1;
break;
case CMPH_FCH:
if (c <= 2.0) c = 2.6;
break;
}
brz->c = c;
brz->m = mph->key_source->nkeys;
DEBUGP("m: %u\n", brz->m);
@ -179,6 +192,7 @@ cmph_t *brz_new(cmph_config_t *mph, float c)
brzf->k = brz->k;
brzf->c = brz->c;
brzf->m = brz->m;
brzf->algo = brz->algo;
mphf->data = brzf;
mphf->size = brz->m;
DEBUGP("Successfully generated minimal perfect hash\n");
@ -191,7 +205,7 @@ cmph_t *brz_new(cmph_config_t *mph, float c)
static int brz_gen_mphf(cmph_config_t *mph)
{
cmph_uint32 i, e;
cmph_uint32 i, e, error;
brz_config_data_t *brz = (brz_config_data_t *)mph->data;
cmph_uint32 memory_usage = 0;
cmph_uint32 nkeys_in_buffer = 0;
@ -269,7 +283,7 @@ static int brz_gen_mphf(cmph_config_t *mph)
memory_usage += keylen + sizeof(keylen);
h0 = hash(brz->h0, key, keylen) % brz->k;
if ((brz->size[h0] == MAX_BUCKET_SIZE) || ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0]))
if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0])))
{
free(buffer);
free(buckets_size);
@ -337,6 +351,7 @@ static int brz_gen_mphf(cmph_config_t *mph)
fwrite(cmph_names[CMPH_BRZ], (cmph_uint32)(strlen(cmph_names[CMPH_BRZ]) + 1), 1, brz->mphf_fd);
fwrite(&(brz->m), sizeof(brz->m), 1, brz->mphf_fd);
fwrite(&(brz->c), sizeof(cmph_float32), 1, brz->mphf_fd);
fwrite(&(brz->algo), sizeof(brz->algo), 1, brz->mphf_fd);
fwrite(&(brz->k), sizeof(cmph_uint32), 1, brz->mphf_fd); // number of MPHFs
fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), 1, brz->mphf_fd);
@ -362,6 +377,7 @@ static int brz_gen_mphf(cmph_config_t *mph)
e = 0;
keys_vd = (cmph_uint8 **)calloc(MAX_BUCKET_SIZE, sizeof(cmph_uint8 *));
nkeys_vd = 0;
error = 0;
while(e < brz->m)
{
i = brz_min_index(buffer_h0, nflushes);
@ -403,18 +419,49 @@ static int brz_gen_mphf(cmph_config_t *mph)
cmph_io_adapter_t *source = NULL;
cmph_config_t *config = NULL;
cmph_t *mphf_tmp = NULL;
bmz8_data_t * bmzf = NULL;
char *bufmphf = NULL;
cmph_uint32 buflenmphf = 0;
// Source of keys
source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd);
config = cmph_config_new(source);
cmph_config_set_algo(config, CMPH_BMZ8);
cmph_config_set_algo(config, brz->algo);
//cmph_config_set_algo(config, CMPH_BMZ8);
cmph_config_set_graphsize(config, brz->c);
mphf_tmp = cmph_new(config);
bmzf = (bmz8_data_t *)mphf_tmp->data;
bufmphf = brz_copy_partial_mphf(brz, bmzf, cur_bucket, &buflenmphf);
bmzf = NULL;
if (mphf_tmp == NULL)
{
if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k);
error = 1;
cmph_config_destroy(config);
brz_destroy_keys_vd(keys_vd, nkeys_vd);
cmph_io_byte_vector_adapter_destroy(source);
break;
}
if(mph->verbosity)
{
if (cur_bucket % 1000 == 0)
{
fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k);
}
}
switch(brz->algo)
{
case CMPH_FCH:
{
fch_data_t * fchf = NULL;
fchf = (fch_data_t *)mphf_tmp->data;
bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf);
}
break;
case CMPH_BMZ8:
{
bmz8_data_t * bmzf = NULL;
bmzf = (bmz8_data_t *)mphf_tmp->data;
bufmphf = brz_copy_partial_bmz8_mphf(brz, bmzf, cur_bucket, &buflenmphf);
}
break;
default: assert(0);
}
fwrite(bufmphf, buflenmphf, 1, brz->mphf_fd);
free(bufmphf);
bufmphf = NULL;
@ -425,11 +472,11 @@ static int brz_gen_mphf(cmph_config_t *mph)
nkeys_vd = 0;
}
}
buffer_manager_destroy(buff_manager);
free(keys_vd);
free(buffer_merge);
free(buffer_h0);
if (error) return 0;
return 1;
}
@ -449,7 +496,29 @@ static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint8 nkeys)
for(i = 0; i < nkeys; i++) { free(keys_vd[i]); keys_vd[i] = NULL;}
}
static char * brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen)
/*
 * Serializes the FCH function built for one bucket into a newly allocated
 * buffer with the layout
 *
 *   [len(h1) : cmph_uint32][h1 state][len(h2) : cmph_uint32][h2 state][g : b bytes]
 *
 * where b is fchf->b and each g entry occupies exactly one byte.
 *
 * brz    - unused; kept so the signature parallels the BMZ8 variant.
 * fchf   - FCH data whose h1, h2, b and g fields are dumped.
 * index  - unused (bucket index).
 * buflen - out: total number of bytes in the returned buffer.
 *
 * Returns the buffer; the caller owns it and must free() it.
 */
static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen)
{
	cmph_uint32 i = 0;
	cmph_uint32 buflenh1 = 0;
	cmph_uint32 buflenh2 = 0;
	char * bufh1 = NULL;
	char * bufh2 = NULL;
	char * buf = NULL;
	char * cursor = NULL;
	unsigned char * gout = NULL;
	cmph_uint32 n = fchf->b; // number of g entries; one output byte each
	(void)brz;
	(void)index;

	hash_state_dump(fchf->h1, &bufh1, &buflenh1);
	hash_state_dump(fchf->h2, &bufh2, &buflenh2);
	*buflen = buflenh1 + buflenh2 + n + 2*sizeof(cmph_uint32);
	// NOTE(review): the allocation result is not checked; the visible caller
	// writes the returned buffer unconditionally and has no failure path.
	buf = (char *)malloc(*buflen);

	// Walk a cursor through the buffer instead of recomputing each offset.
	cursor = buf;
	memcpy(cursor, &buflenh1, sizeof(cmph_uint32));
	cursor += sizeof(cmph_uint32);
	memcpy(cursor, bufh1, buflenh1);
	cursor += buflenh1;
	memcpy(cursor, &buflenh2, sizeof(cmph_uint32));
	cursor += sizeof(cmph_uint32);
	memcpy(cursor, bufh2, buflenh2);
	cursor += buflenh2;

	// Store each g entry by VALUE, narrowed to one byte. Copying a single byte
	// from the address of g[i] depends on host byte order whenever g's element
	// type is wider than a byte (presumably it is -- confirm in fch_structs.h);
	// on a big-endian host that would store the high byte, not the value.
	gout = (unsigned char *)cursor;
	for (i = 0; i < n; i++) gout[i] = (unsigned char)fchf->g[i];

	free(bufh1);
	free(bufh2);
	return buf;
}
static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen)
{
cmph_uint32 buflenh1 = 0;
cmph_uint32 buflenh2 = 0;
@ -470,6 +539,8 @@ static char * brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf,
free(bufh2);
return buf;
}
int brz_dump(cmph_t *mphf, FILE *fd)
{
brz_data_t *data = (brz_data_t *)mphf->data;
@ -489,9 +560,6 @@ int brz_dump(cmph_t *mphf, FILE *fd)
return 1;
}
void brz_load(FILE *f, cmph_t *mphf)
{
char *buf = NULL;
@ -502,31 +570,41 @@ void brz_load(FILE *f, cmph_t *mphf)
DEBUGP("Loading brz mphf\n");
mphf->data = brz;
fread(&(brz->c), sizeof(cmph_float32), 1, f);
fread(&(brz->algo), sizeof(brz->algo), 1, f); // Reading algo.
fread(&(brz->k), sizeof(cmph_uint32), 1, f);
brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k);
fread(brz->size, sizeof(cmph_uint8)*(brz->k), 1, f);
brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k);
brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k);
brz->g = (cmph_uint8 **) calloc(brz->k, sizeof(cmph_uint8 *));
DEBUGP("Reading %u h1 and %u h2\n", brz->k, brz->k);
DEBUGP("Reading c = %f k = %u algo = %u \n", brz->c, brz->k, brz->algo);
//loading h_i1, h_i2 and g_i.
for(i = 0; i < brz->k; i++)
{
// h1
fread(&buflen, sizeof(cmph_uint32), 1, f);
DEBUGP("Hash state has %u bytes\n", buflen);
DEBUGP("Hash state 1 has %u bytes\n", buflen);
buf = (char *)malloc(buflen);
fread(buf, buflen, 1, f);
brz->h1[i] = hash_state_load(buf, buflen);
free(buf);
//h2
fread(&buflen, sizeof(cmph_uint32), 1, f);
DEBUGP("Hash state has %u bytes\n", buflen);
DEBUGP("Hash state 2 has %u bytes\n", buflen);
buf = (char *)malloc(buflen);
fread(buf, buflen, 1, f);
brz->h2[i] = hash_state_load(buf, buflen);
free(buf);
n = ceil(brz->c * brz->size[i]);
switch(brz->algo)
{
case CMPH_FCH:
n = fch_calc_b(brz->c, brz->size[i]);
break;
case CMPH_BMZ8:
n = ceil(brz->c * brz->size[i]);
break;
default: assert(0);
}
DEBUGP("g_i has %u bytes\n", n);
brz->g[i] = (cmph_uint8 *)calloc(n, sizeof(cmph_uint8));
fread(brz->g[i], sizeof(cmph_uint8)*n, 1, f);
@ -546,9 +624,8 @@ void brz_load(FILE *f, cmph_t *mphf)
return;
}
cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
static cmph_uint32 brz_bmz8_search(brz_data_t *brz, const char *key, cmph_uint32 keylen)
{
brz_data_t *brz = mphf->data;
cmph_uint32 h0 = hash(brz->h0, key, keylen) % brz->k;
cmph_uint32 m = brz->size[h0];
cmph_uint32 n = ceil(brz->c * m);
@ -562,6 +639,35 @@ cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
DEBUGP("Address: %u\n", mphf_bucket + brz->offset[h0]);
return (mphf_bucket + brz->offset[h0]);
}
/*
 * Evaluates the BRZ function for `key` when the per-bucket functions were
 * built with FCH.
 *
 * h0 selects the bucket; the bucket's own h1/h2 and g table then give the
 * position inside the bucket, which is added to the bucket's offset.
 *
 * brz    - loaded BRZ data (h0, per-bucket h1/h2/g, size, offset, c, k).
 * key    - key bytes.
 * keylen - number of bytes in key.
 *
 * Returns the computed index. If the key hashes to an empty bucket (possible
 * only for a key outside the indexed set) 0 is returned; the value is
 * arbitrary, as for any non-member key.
 */
static cmph_uint32 brz_fch_search(brz_data_t *brz, const char *key, cmph_uint32 keylen)
{
	cmph_uint32 h0 = hash(brz->h0, key, keylen) % brz->k;
	cmph_uint32 m = brz->size[h0];
	cmph_uint32 b = 0;
	cmph_float32 p1 = 0;
	cmph_float32 p2 = 0;
	cmph_uint32 h1 = 0;
	cmph_uint32 h2 = 0;
	cmph_uint8 mphf_bucket = 0;

	// m is used as a modulus three times below, so an empty bucket must be
	// rejected before any division takes place.
	if (m == 0) return 0;

	b = fch_calc_b(brz->c, m);
	p1 = fch_calc_p1(m);
	p2 = fch_calc_p2(b);
	h1 = hash(brz->h1[h0], key, keylen) % m;
	h2 = hash(brz->h2[h0], key, keylen) % m;
	h1 = mixh10h11h12(b, p1, p2, h1); // map h1 into the g table's index range
	mphf_bucket = (h2 + brz->g[h0][h1]) % m;
	return (mphf_bucket + brz->offset[h0]);
}
/*
 * Public BRZ lookup: forwards to the search routine that matches the
 * algorithm recorded in the function's data (FCH or BMZ8).
 *
 * Any other algorithm value trips an assertion; when assertions are compiled
 * out the function returns 0.
 */
cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	brz_data_t *data = mphf->data;

	if (data->algo == CMPH_FCH)
	{
		return brz_fch_search(data, key, keylen);
	}
	if (data->algo == CMPH_BMZ8)
	{
		return brz_bmz8_search(data, key, keylen);
	}

	assert(0); // unsupported per-bucket algorithm
	return 0;
}
void brz_destroy(cmph_t *mphf)
{
cmph_uint32 i;

View File

@ -5,6 +5,7 @@
struct __brz_data_t
{
CMPH_ALGO algo; // CMPH algo for generating the MPHFs for the buckets (Just CMPH_FCH and CMPH_BMZ8)
cmph_uint32 m; // edges (words) count
cmph_float32 c; // constant c
cmph_uint8 *size; // size[i] stores the number of edges represented by g[i][...].
@ -19,6 +20,7 @@ struct __brz_data_t
struct __brz_config_data_t
{
CMPH_HASH hashfuncs[3];
CMPH_ALGO algo; // CMPH algo for generating the MPHFs for the buckets (Just CMPH_FCH and CMPH_BMZ8)
cmph_float32 c; // constant c
cmph_uint32 m; // edges (words) count
cmph_uint8 *size; // size[i] stores the number of edges represented by g[i][...].

View File

@ -60,6 +60,7 @@ cmph_t *chm_new(cmph_config_t *mph, float c)
cmph_uint8 *visited = NULL;
chm_config_data_t *chm = (chm_config_data_t *)mph->data;
chm->m = mph->key_source->nkeys;
if (c == 0) c = 2.09;
chm->n = ceil(c * mph->key_source->nkeys);
DEBUGP("m (edges): %u n (vertices): %u c: %f\n", chm->m, chm->n, c);
chm->graph = graph_new(chm->n, chm->m);

View File

@ -354,27 +354,22 @@ cmph_t *cmph_new(cmph_config_t *mph)
{
case CMPH_CHM:
DEBUGP("Creating chm hash\n");
if (c == 0) c = 2.09;
mphf = chm_new(mph, c);
break;
case CMPH_BMZ: /* included -- Fabiano */
DEBUGP("Creating bmz hash\n");
if (c == 0) c = 1.15;
mphf = bmz_new(mph, c);
break;
case CMPH_BMZ8: /* included -- Fabiano */
DEBUGP("Creating bmz8 hash\n");
if (c == 0) c = 1.15;
mphf = bmz8_new(mph, c);
break;
case CMPH_BRZ: /* included -- Fabiano */
DEBUGP("Creating brz hash\n");
if (c == 0) c = 1.15;
mphf = brz_new(mph, c);
break;
case CMPH_FCH: /* included -- Fabiano */
DEBUGP("Creating fch hash\n");
if (c <= 2) c = 2.6;
mphf = fch_new(mph, c);
break;
default:

View File

@ -13,8 +13,6 @@
//#define DEBUG
#include "debug.h"
static cmph_uint32 mixh10h11h12(cmph_uint32 b, cmph_float32 p1, cmph_float32 p2, cmph_uint32 initial_index);
static void calc_parameters(fch_config_data_t *fch);
static fch_buckets_t * mapping(cmph_config_t *mph);
static cmph_uint32 * ordering(fch_buckets_t * buckets);
static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t * buckets, cmph_uint32 *sorted_indexes);
@ -57,7 +55,7 @@ void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
}
}
static cmph_uint32 mixh10h11h12(cmph_uint32 b, cmph_float32 p1, cmph_float32 p2, cmph_uint32 initial_index)
cmph_uint32 mixh10h11h12(cmph_uint32 b, cmph_float32 p1, cmph_float32 p2, cmph_uint32 initial_index)
{
if (initial_index < p1) initial_index %= (cmph_uint32)p2; /* h11 o h10 */
else { /* h12 o h10 */
@ -67,11 +65,20 @@ static cmph_uint32 mixh10h11h12(cmph_uint32 b, cmph_float32 p1, cmph_float32 p2,
return initial_index;
}
static void calc_parameters(fch_config_data_t *fch)
cmph_uint32 fch_calc_b(cmph_float32 c, cmph_uint32 m)
{
fch->b = (cmph_uint32)ceil((fch->c*fch->m)/(log(fch->m)/log(2) + 1));
fch->p1 = ceil(0.55*fch->m);
fch->p2 = ceil(0.3*fch->b);
return (cmph_uint32)ceil((c*m)/(log(m)/log(2) + 1));
}
/* FCH parameter p1 for a set of m keys: ceil(0.55 * m). */
cmph_float32 fch_calc_p1(cmph_uint32 m)
{
	const double scaled_keys = 0.55 * m;
	return ceil(scaled_keys);
}
/* FCH parameter p2 for a table of b entries: ceil(0.3 * b). */
cmph_float32 fch_calc_p2(cmph_uint32 b)
{
	const double scaled_entries = 0.3 * b;
	return ceil(scaled_entries);
}
static fch_buckets_t * mapping(cmph_config_t *mph)
@ -81,7 +88,9 @@ static fch_buckets_t * mapping(cmph_config_t *mph)
fch_config_data_t *fch = (fch_config_data_t *)mph->data;
if (fch->h1) hash_state_destroy(fch->h1);
fch->h1 = hash_state_new(fch->hashfuncs[0], fch->m);
calc_parameters (fch);
fch->b = fch_calc_b(fch->c, fch->m);
fch->p1 = fch_calc_p1(fch->m);
fch->p2 = fch_calc_p2(fch->b);
//DEBUGP("b:%u p1:%f p2:%f\n", fch->b, fch->p1, fch->p2);
buckets = fch_buckets_new(fch->b);
@ -247,6 +256,7 @@ cmph_t *fch_new(cmph_config_t *mph, float c)
fch_config_data_t *fch = (fch_config_data_t *)mph->data;
fch->m = mph->key_source->nkeys;
//DEBUGP("m: %f\n", fch->m);
if (c <= 2) c = 2.6; // validating restrictions over parameter c.
fch->c = c;
//DEBUGP("c: %f\n", fch->c);
fch->h1 = NULL;
@ -389,7 +399,6 @@ cmph_uint32 fch_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
fch_data_t *fch = mphf->data;
cmph_uint32 h1 = hash(fch->h1, key, keylen) % fch->m;
cmph_uint32 h2 = hash(fch->h2, key, keylen) % fch->m;
h1 = hash(fch->h1, key, keylen) % fch->m;
h1 = mixh10h11h12 (fch->b, fch->p1, fch->p2, h1);
//DEBUGP("key: %s h1: %u h2: %u g[h1]: %u\n", key, h1, h2, fch->g[h1]);
return (h2 + fch->g[h1]) % fch->m;

View File

@ -6,6 +6,12 @@
typedef struct __fch_data_t fch_data_t;
typedef struct __fch_config_data_t fch_config_data_t;
/* Parameters calculation */
cmph_uint32 fch_calc_b(cmph_float32 c, cmph_uint32 m);
cmph_float32 fch_calc_p1(cmph_uint32 m);
cmph_float32 fch_calc_p2(cmph_uint32 b);
cmph_uint32 mixh10h11h12(cmph_uint32 b, cmph_float32 p1, cmph_float32 p2, cmph_uint32 initial_index);
fch_config_data_t *fch_config_new();
void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
void fch_config_destroy(cmph_config_t *mph);

View File

@ -217,7 +217,6 @@ int main(int argc, char **argv)
if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
if (generate)
{
//Create mphf
@ -230,7 +229,8 @@ int main(int argc, char **argv)
cmph_config_set_mphf_fd(config, mphf_fd);
cmph_config_set_memory_availability(config, memory_availability);
cmph_config_set_b(config, b);
if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
//if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15;
if (c != 0) cmph_config_set_graphsize(config, c);
mphf = cmph_new(config);
cmph_config_destroy(config);