Remaining part of the fix for bug 3465649. This one fixes both BRZ and CHD_PH for small key sets.
This commit is contained in:
Fabiano C. Botelho 2013-04-20 01:28:00 -07:00
parent 1b2a7cedff
commit 9f999ef428
7 changed files with 112 additions and 17 deletions

View File

@ -1,4 +1,4 @@
noinst_PROGRAMS = vector_adapter_ex1 file_adapter_ex2 struct_vector_adapter_ex3 small_set_test noinst_PROGRAMS = vector_adapter_ex1 file_adapter_ex2 struct_vector_adapter_ex3 small_set_ex4
INCLUDES = -I../src/ INCLUDES = -I../src/
@ -11,5 +11,5 @@ file_adapter_ex2_SOURCES = file_adapter_ex2.c
struct_vector_adapter_ex3_LDADD = ../src/libcmph.la struct_vector_adapter_ex3_LDADD = ../src/libcmph.la
struct_vector_adapter_ex3_SOURCES = struct_vector_adapter_ex3.c struct_vector_adapter_ex3_SOURCES = struct_vector_adapter_ex3.c
small_set_test_LDADD = ../src/libcmph.la small_set_ex4_LDADD = ../src/libcmph.la
small_set_test_SOURCES = small_set_test.c small_set_ex4_SOURCES = small_set_ex4.c

View File

@ -6,6 +6,8 @@ int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n)
cmph_config_t *config; cmph_config_t *config;
cmph_io_adapter_t *source; cmph_io_adapter_t *source;
cmph_uint32 i; cmph_uint32 i;
char filename[256];
FILE* mphf_fd = NULL;
printf("%s (%u)\n", cmph_names[alg_n], alg_n); printf("%s (%u)\n", cmph_names[alg_n], alg_n);
@ -16,9 +18,21 @@ int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n)
items_len); items_len);
config = cmph_config_new(source); config = cmph_config_new(source);
cmph_config_set_algo(config, alg_n); cmph_config_set_algo(config, alg_n);
if (alg_n == CMPH_BRZ) {
sprintf(filename, "%s_%u.mph", cmph_names[alg_n], items_len);
mphf_fd = fopen(filename, "w");
cmph_config_set_mphf_fd(config, mphf_fd);
}
hash = cmph_new(config); hash = cmph_new(config);
cmph_config_destroy(config); cmph_config_destroy(config);
if (alg_n == CMPH_BRZ) {
cmph_dump(hash, mphf_fd);
cmph_destroy(hash);
fclose(mphf_fd);
mphf_fd = fopen(filename, "r");
hash = cmph_load(mphf_fd);
}
printf("packed_size %u\n",cmph_packed_size(hash)); printf("packed_size %u\n",cmph_packed_size(hash));
for (i=0; i<items_len; ++i) for (i=0; i<items_len; ++i)
@ -30,7 +44,11 @@ int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n)
printf("\n"); printf("\n");
cmph_io_vector_adapter_destroy(source); cmph_io_vector_adapter_destroy(source);
cmph_destroy(hash); cmph_destroy(hash);
if (alg_n == CMPH_BRZ) {
fclose(mphf_fd);
}
return 0; return 0;
} }
@ -43,6 +61,8 @@ int main (void)
cmph_uint32 vec2_len = 2; cmph_uint32 vec2_len = 2;
cmph_uint32 vec3[] = {2184764, 1882984, 1170551}; // CMPH_CHD_PH, CMPH_CHD (7,8) cmph_uint32 vec3[] = {2184764, 1882984, 1170551}; // CMPH_CHD_PH, CMPH_CHD (7,8)
cmph_uint32 vec3_len = 3; cmph_uint32 vec3_len = 3;
cmph_uint32 vec4[] = {2184764}; // CMPH_CHD_PH, CMPH_CHD (7,8)
cmph_uint32 vec4_len = 1;
cmph_uint32 i; cmph_uint32 i;
// Testing with vec1 // Testing with vec1
@ -63,7 +83,7 @@ int main (void)
test(values, length, i); test(values, length, i);
} }
// Testing with vec2 // Testing with vec3
values = (cmph_uint32*)vec3; values = (cmph_uint32*)vec3;
length = vec3_len; length = vec3_len;
printf("TESTING VECTOR WITH %u INTEGERS\n", length); printf("TESTING VECTOR WITH %u INTEGERS\n", length);
@ -72,5 +92,14 @@ int main (void)
test(values, length, i); test(values, length, i);
} }
// Testing with vec4
values = (cmph_uint32*)vec4;
length = vec4_len;
printf("TESTING VECTOR WITH %u INTEGERS\n", length);
for (i = 0; i < CMPH_COUNT; i++)
{
test(values, length, i);
}
return 0; return 0;
} }

View File

@ -70,6 +70,12 @@ cmph_t *bmz_new(cmph_config_t *mph, double c)
DEBUGP("c: %f\n", c); DEBUGP("c: %f\n", c);
bmz->m = mph->key_source->nkeys; bmz->m = mph->key_source->nkeys;
bmz->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); bmz->n = (cmph_uint32)ceil(c * mph->key_source->nkeys);
if (bmz->n < 5) // workaround for small key sets
{
bmz->n = 5;
}
DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz->m, bmz->n, c); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz->m, bmz->n, c);
bmz->graph = graph_new(bmz->n, bmz->m); bmz->graph = graph_new(bmz->n, bmz->m);
DEBUGP("Created graph\n"); DEBUGP("Created graph\n");

View File

@ -74,6 +74,12 @@ cmph_t *bmz8_new(cmph_config_t *mph, double c)
DEBUGP("c: %f\n", c); DEBUGP("c: %f\n", c);
bmz8->m = (cmph_uint8) mph->key_source->nkeys; bmz8->m = (cmph_uint8) mph->key_source->nkeys;
bmz8->n = (cmph_uint8) ceil(c * mph->key_source->nkeys); bmz8->n = (cmph_uint8) ceil(c * mph->key_source->nkeys);
if (bmz8->n < 5) // workaround for small key sets
{
bmz8->n = 5;
}
DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz8->m, bmz8->n, c); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz8->m, bmz8->n, c);
bmz8->graph = graph_new(bmz8->n, bmz8->m); bmz8->graph = graph_new(bmz8->n, bmz8->m);
DEBUGP("Created graph\n"); DEBUGP("Created graph\n");

View File

@ -27,9 +27,9 @@ static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * b
brz_config_data_t *brz_config_new(void) brz_config_data_t *brz_config_new(void)
{ {
brz_config_data_t *brz = NULL; brz_config_data_t *brz = NULL;
brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t));
if (!brz) return NULL; if (!brz) return NULL;
brz->algo = CMPH_FCH; brz->algo = CMPH_FCH;
brz->b = 128; brz->b = 128;
brz->hashfuncs[0] = CMPH_HASH_JENKINS; brz->hashfuncs[0] = CMPH_HASH_JENKINS;
brz->hashfuncs[1] = CMPH_HASH_JENKINS; brz->hashfuncs[1] = CMPH_HASH_JENKINS;
@ -131,6 +131,15 @@ cmph_t *brz_new(cmph_config_t *mph, double c)
DEBUGP("c: %f\n", c); DEBUGP("c: %f\n", c);
brz_config_data_t *brz = (brz_config_data_t *)mph->data; brz_config_data_t *brz = (brz_config_data_t *)mph->data;
// Since we keep dumping partial pieces of the MPHF as it gets created
// the caller must set the file to store the resulting MPHF before calling
// this function.
if (brz->mphf_fd == NULL)
{
return NULL;
}
switch(brz->algo) // validating restrictions over parameter c. switch(brz->algo) // validating restrictions over parameter c.
{ {
case CMPH_BMZ8: case CMPH_BMZ8:
@ -144,6 +153,11 @@ cmph_t *brz_new(cmph_config_t *mph, double c)
} }
brz->c = c; brz->c = c;
brz->m = mph->key_source->nkeys; brz->m = mph->key_source->nkeys;
if (brz->m < 5)
{
brz->c = 5;
}
DEBUGP("m: %u\n", brz->m); DEBUGP("m: %u\n", brz->m);
brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b)); brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b));
DEBUGP("k: %u\n", brz->k); DEBUGP("k: %u\n", brz->k);
@ -364,7 +378,7 @@ static int brz_gen_mphf(cmph_config_t *mph)
{ {
fprintf(stderr, "\nMPHF generation \n"); fprintf(stderr, "\nMPHF generation \n");
} }
/* Starting to dump to disk the resultant MPHF: __cmph_dump function */ /* Starting to dump to disk the resulting MPHF: __cmph_dump function */
nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd); nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd);
nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd);
nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd);
@ -442,7 +456,7 @@ static int brz_gen_mphf(cmph_config_t *mph)
source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd); source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd);
config = cmph_config_new(source); config = cmph_config_new(source);
cmph_config_set_algo(config, brz->algo); cmph_config_set_algo(config, brz->algo);
//cmph_config_set_algo(config, CMPH_BMZ8); cmph_config_set_hashfuncs(config, brz->hashfuncs);
cmph_config_set_graphsize(config, brz->c); cmph_config_set_graphsize(config, brz->c);
mphf_tmp = cmph_new(config); mphf_tmp = cmph_new(config);
if (mphf_tmp == NULL) if (mphf_tmp == NULL)
@ -565,7 +579,7 @@ int brz_dump(cmph_t *mphf, FILE *fd)
cmph_uint32 buflen; cmph_uint32 buflen;
register size_t nbytes; register size_t nbytes;
DEBUGP("Dumping brzf\n"); DEBUGP("Dumping brzf\n");
// The initial part of the MPHF have already been dumped to disk during construction // The initial part of the MPHF has already been dumped to disk during construction
// Dumping h0 // Dumping h0
hash_state_dump(data->h0, &buf, &buflen); hash_state_dump(data->h0, &buf, &buflen);
DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
@ -730,7 +744,13 @@ void brz_pack(cmph_t *mphf, void *packed_mphf)
brz_data_t *data = (brz_data_t *)mphf->data; brz_data_t *data = (brz_data_t *)mphf->data;
cmph_uint8 * ptr = (cmph_uint8 *)packed_mphf; cmph_uint8 * ptr = (cmph_uint8 *)packed_mphf;
cmph_uint32 i,n; cmph_uint32 i,n;
// This assumes that if one function pointer is NULL,
// all the others will be as well.
if (data->h1 == NULL)
{
return;
}
// packing internal algo type // packing internal algo type
memcpy(ptr, &(data->algo), sizeof(data->algo)); memcpy(ptr, &(data->algo), sizeof(data->algo));
ptr += sizeof(data->algo); ptr += sizeof(data->algo);
@ -821,9 +841,21 @@ cmph_uint32 brz_packed_size(cmph_t *mphf)
cmph_uint32 i; cmph_uint32 i;
cmph_uint32 size = 0; cmph_uint32 size = 0;
brz_data_t *data = (brz_data_t *)mphf->data; brz_data_t *data = (brz_data_t *)mphf->data;
CMPH_HASH h0_type = hash_get_type(data->h0); CMPH_HASH h0_type;
CMPH_HASH h1_type = hash_get_type(data->h1[0]); CMPH_HASH h1_type;
CMPH_HASH h2_type = hash_get_type(data->h2[0]); CMPH_HASH h2_type;
// This assumes that if one function pointer is NULL,
// all the others will be as well.
if (data->h1 == NULL)
{
return 0U;
}
h0_type = hash_get_type(data->h0);
h1_type = hash_get_type(data->h1[0]);
h2_type = hash_get_type(data->h2[0]);
size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) + size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) +
sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k); sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k);
// pointers to g_is // pointers to g_is

View File

@ -3,6 +3,21 @@
#include "cmph.h" #include "cmph.h"
/*
* The BRZ algorithm was designed to consume the bare minimum amount
* of memory while generating MPHFs. We therefore decided to dump the
* resulting MPHF to disk while it is being created. Thus, to use the
* BRZ algorithm, one has to call brz_config_set_mphf_fd before
* calling brz_new; otherwise the MPHF creation fails.
* One side effect of this design decision is that the resulting
* MPHF cannot be used until its dumping process is finalized by
* calling brz_dump, and the caller must then use brz_load before
* making any call to the following functions:
* brz_search
* brz_pack
* brz_packed_size
* brz_search_packed
*/
typedef struct __brz_data_t brz_data_t; typedef struct __brz_data_t brz_data_t;
typedef struct __brz_config_data_t brz_config_data_t; typedef struct __brz_config_data_t brz_config_data_t;

View File

@ -627,7 +627,8 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c)
register double load_factor = c; register double load_factor = c;
register cmph_uint8 searching_success = 0; register cmph_uint8 searching_success = 0;
register cmph_uint32 max_probes = 1 << 20; // default value for max_probes register cmph_uint32 max_probes_default = 1 << 20; // default value for max_probes
register cmph_uint32 max_probes;
register cmph_uint32 iterations = 100; register cmph_uint32 iterations = 100;
chd_ph_bucket_t * buckets = NULL; chd_ph_bucket_t * buckets = NULL;
chd_ph_item_t * items = NULL; chd_ph_item_t * items = NULL;
@ -688,7 +689,13 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c)
buckets = chd_ph_bucket_new(chd_ph->nbuckets); buckets = chd_ph_bucket_new(chd_ph->nbuckets);
items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t)); items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t));
max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes); max_probes = (cmph_uint32)((log(chd_ph->m)/log(2))/20);
if (max_probes == 0) {
max_probes = max_probes_default;
} else {
max_probes = max_probes * max_probes_default;
}
if(chd_ph->keys_per_bin == 1) if(chd_ph->keys_per_bin == 1)
chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32)); chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32));