From 59ddeb6379dc47469bde475b1e30b7dc5f295e4b Mon Sep 17 00:00:00 2001 From: fc_botelho Date: Wed, 25 Jan 2006 19:45:14 +0000 Subject: [PATCH] stable version of BRZ algorithm using buffers --- src/Makefile.am | 6 +- src/bmz.c | 11 +- src/bmz8.c | 23 +-- src/brz.c | 364 +++++++++++++++++++++---------------------- src/brz.h | 2 + src/brz_structs.h | 7 +- src/cmph.c | 51 +++++- src/cmph.h | 2 + src/jenkins_hash.c | 12 +- src/jenkins_hash.h | 2 - src/main.c | 39 +++-- src/vqueue.c | 2 +- vldb/pt/figs/brz.fig | 204 +++++++++++++----------- 13 files changed, 398 insertions(+), 327 deletions(-) diff --git a/src/Makefile.am b/src/Makefile.am index 1d05357..6eb41d8 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = cmph lib_LTLIBRARIES = libcmph.la include_HEADERS = cmph.h cmph_types.h -libcmph_la_SOURCES = debug.h\ +libcmph_la_SOURCES = util.h debug.h\ bitbool.h bitbool.c\ cmph_types.h\ hash.h hash_state.h hash.c\ @@ -17,9 +17,11 @@ libcmph_la_SOURCES = debug.h\ chm.h chm_structs.h chm.c\ bmz.h bmz_structs.h bmz.c\ bmz8.h bmz8_structs.h bmz8.c\ + buffer_manage.h buffer_manage.c\ + buffer_entry.h buffer_entry.c\ brz.h brz_structs.h brz.c libcmph_la_LDFLAGS = -version-info 0:0:0 -cmph_SOURCES = main.c ../wingetopt.h ../wingetopt.c +cmph_SOURCES = main.c wingetopt.h wingetopt.c cmph_LDADD = libcmph.la diff --git a/src/bmz.c b/src/bmz.c index 2fba0c2..be31b1a 100644 --- a/src/bmz.c +++ b/src/bmz.c @@ -64,7 +64,7 @@ cmph_t *bmz_new(cmph_config_t *mph, float c) cmph_uint8 *used_edges = NULL; cmph_uint8 restart_mapping = 0; cmph_uint8 * visited = NULL; - + bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data; DEBUGP("c: %f\n", c); bmz->m = mph->key_source->nkeys; @@ -93,7 +93,7 @@ cmph_t *bmz_new(cmph_config_t *mph, float c) bmz->hashes[0] = hash_state_new(bmz->hashfuncs[0], bmz->n); DEBUGP("hash function 2\n"); bmz->hashes[1] = hash_state_new(bmz->hashfuncs[1], bmz->n); - DEBUGP("Generating edges\n"); + DEBUGP("Generating edges\n"); ok = bmz_gen_edges(mph); if (!ok) { @@ -109,20 +109,18 @@ cmph_t *bmz_new(cmph_config_t *mph, float c) } if (iterations == 0) break; } - else break; + else break; } if (iterations == 0) { graph_destroy(bmz->graph); return NULL; } - // Ordering step if (mph->verbosity) { fprintf(stderr, "Starting ordering step\n"); } - graph_obtain_critical_nodes(bmz->graph); // Searching step @@ -164,7 +162,7 @@ cmph_t *bmz_new(cmph_config_t *mph, float c) free(used_edges); free(visited); }while(restart_mapping && iterations_map > 0); - graph_destroy(bmz->graph); + graph_destroy(bmz->graph); bmz->graph = NULL; if (iterations_map == 0) { @@ -181,6 +179,7 @@ cmph_t *bmz_new(cmph_config_t *mph, float c) bmzf->m = bmz->m; mphf->data = bmzf; mphf->size = bmz->m; + DEBUGP("Successfully generated minimal perfect hash\n"); if (mph->verbosity) { diff --git a/src/bmz8.c b/src/bmz8.c index c4837c2..0ebb3ce 100644 --- a/src/bmz8.c +++ b/src/bmz8.c @@ -5,7 +5,6 @@ #include "hash.h" #include "vqueue.h" #include "bitbool.h" - #include #include #include @@ -66,7 +65,6 @@ cmph_t *bmz8_new(cmph_config_t *mph, float c) cmph_uint8 * visited = NULL; bmz8_config_data_t *bmz8 = (bmz8_config_data_t *)mph->data; - if (mph->key_source->nkeys >= 256) { if (mph->verbosity) fprintf(stderr, "The number of keys in BMZ8 must be lower than 256.\n"); @@ -168,9 +166,11 @@ cmph_t *bmz8_new(cmph_config_t *mph, float c) iterations_map--; if (mph->verbosity) fprintf(stderr, "Restarting mapping step. %u iterations remaining.\n", iterations_map); } + free(used_edges); free(visited); - }while(restart_mapping && iterations_map > 0); + + }while(restart_mapping && iterations_map > 0); graph_destroy(bmz8->graph); bmz8->graph = NULL; if (iterations_map == 0) @@ -266,8 +266,8 @@ static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz8, cmph_uint8 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited) { cmph_uint8 next_g; - cmph_uint32 u; /* Auxiliary vertex */ - cmph_uint32 lav; /* lookahead vertex */ + cmph_uint32 u; + cmph_uint32 lav; cmph_uint8 collision; cmph_uint8 * unused_g_values = NULL; cmph_uint8 unused_g_values_capacity = 0; @@ -278,7 +278,7 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz DEBUGP("Labelling critical vertices\n"); bmz8->g[v] = (cmph_uint8)ceil ((double)(*biggest_edge_value)/2) - 1; SETBIT(visited, v); - next_g = (cmph_uint8)floor((double)(*biggest_edge_value/2)); /* next_g is incremented in the do..while statement*/ + next_g = (cmph_uint8)floor((double)(*biggest_edge_value/2)); vqueue_insert(q, v); while(!vqueue_is_empty(q)) { @@ -324,14 +324,15 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz { if(nunused_g_values == unused_g_values_capacity) { - unused_g_values = realloc(unused_g_values, (unused_g_values_capacity + BUFSIZ)*sizeof(cmph_uint8)); + unused_g_values = realloc(unused_g_values, (unused_g_values_capacity + BUFSIZ)*sizeof(cmph_uint8)); unused_g_values_capacity += BUFSIZ; } unused_g_values[nunused_g_values++] = next_g; } if (next_g > *biggest_g_value) *biggest_g_value = next_g; - } + } + next_g_index--; if (next_g_index < nunused_g_values) unused_g_values[next_g_index] = unused_g_values[--nunused_g_values]; @@ -345,9 +346,11 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz if(next_g + bmz8->g[lav] > *biggest_edge_value) *biggest_edge_value = next_g + bmz8->g[lav]; } } + bmz8->g[u] = next_g; // Labelling vertex u. SETBIT(visited, u); - vqueue_insert(q, u); + vqueue_insert(q, u); + } } @@ -537,7 +540,7 @@ cmph_uint8 bmz8_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) void bmz8_destroy(cmph_t *mphf) { bmz8_data_t *data = (bmz8_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hashes[0]); hash_state_destroy(data->hashes[1]); free(data->hashes); diff --git a/src/brz.c b/src/brz.c index 44dcaa7..10d0367 100755 --- a/src/brz.c +++ b/src/brz.c @@ -4,10 +4,10 @@ #include "brz.h" #include "cmph_structs.h" #include "brz_structs.h" +#include "buffer_manage.h" #include "cmph.h" #include "hash.h" #include "bitbool.h" - #include #include #include @@ -21,12 +21,14 @@ static int brz_gen_graphs(cmph_config_t *mph); static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n); static char * brz_read_key(FILE * fd); static void brz_destroy_keys_vd(char ** keys_vd, cmph_uint8 nkeys); -static void brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_io_adapter_t *source); +static char * brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen); +//static void brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index); static void brz_flush_g(brz_config_data_t *brz, cmph_uint32 *start_index, FILE * fd); brz_config_data_t *brz_config_new() { brz_config_data_t *brz = NULL; brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); + brz->b = 128; brz->hashfuncs[0] = CMPH_HASH_JENKINS; brz->hashfuncs[1] = CMPH_HASH_JENKINS; brz->hashfuncs[2] = CMPH_HASH_JENKINS; @@ -35,10 +37,11 @@ brz_config_data_t *brz_config_new() brz->g = NULL; brz->h1 = NULL; brz->h2 = NULL; - brz->h3 = NULL; + brz->h0 = NULL; brz->memory_availability = 1024*1024; brz->tmp_dir = (cmph_uint8 *)calloc(10, sizeof(cmph_uint8)); - strcpy(brz->tmp_dir, "/var/tmp/\0"); + brz->mphf_fd = NULL; + strcpy((char *)(brz->tmp_dir), "/var/tmp/"); assert(brz); return brz; } @@ -46,6 +49,7 @@ brz_config_data_t *brz_config_new() void brz_config_destroy(cmph_config_t *mph) { brz_config_data_t *data = (brz_config_data_t *)mph->data; + free(data->tmp_dir); DEBUGP("Destroying algorithm dependent data\n"); free(data); } @@ -74,22 +78,35 @@ void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) brz_config_data_t *brz = (brz_config_data_t *)mph->data; if(tmp_dir) { - cmph_uint32 len = strlen(tmp_dir); + cmph_uint32 len = strlen((char *)tmp_dir); free(brz->tmp_dir); if(tmp_dir[len-1] != '/') { brz->tmp_dir = calloc(len+2, sizeof(cmph_uint8)); - sprintf(brz->tmp_dir, "%s/", tmp_dir); + sprintf((char *)(brz->tmp_dir), "%s/", (char *)tmp_dir); } else { brz->tmp_dir = calloc(len+1, sizeof(cmph_uint8)); - sprintf(brz->tmp_dir, "%s", tmp_dir); + sprintf((char *)(brz->tmp_dir), "%s", (char *)tmp_dir); } } } +void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + brz->mphf_fd = mphf_fd; + assert(brz->mphf_fd); +} + +void brz_config_set_b(cmph_config_t *mph, cmph_uint8 b) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + brz->b = b; +} + cmph_t *brz_new(cmph_config_t *mph, float c) { cmph_t *mphf = NULL; @@ -102,7 +119,7 @@ cmph_t *brz_new(cmph_config_t *mph, float c) brz->c = c; brz->m = mph->key_source->nkeys; DEBUGP("m: %u\n", brz->m); - brz->k = ceil(brz->m/170); + brz->k = ceil(brz->m/(brz->b)); DEBUGP("k: %u\n", brz->k); brz->size = (cmph_uint8 *) calloc(brz->k, sizeof(cmph_uint8)); @@ -112,22 +129,22 @@ cmph_t *brz_new(cmph_config_t *mph, float c) fprintf(stderr, "Partioning the set of keys.\n"); } - brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); - brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); - brz->g = (cmph_uint8 **) malloc(sizeof(cmph_uint8 *) *brz->k); +// brz->h1 = (hash_state_t **)calloc(brz->k, sizeof(hash_state_t *)); +// brz->h2 = (hash_state_t **)calloc(brz->k, sizeof(hash_state_t *)); +// brz->g = (cmph_uint8 **) calloc(brz->k, sizeof(cmph_uint8 *)); while(1) { int ok; DEBUGP("hash function 3\n"); - brz->h3 = hash_state_new(brz->hashfuncs[2], brz->k); + brz->h0 = hash_state_new(brz->hashfuncs[2], brz->k); DEBUGP("Generating graphs\n"); ok = brz_gen_graphs(mph); if (!ok) { --iterations; - hash_state_destroy(brz->h3); - brz->h3 = NULL; + hash_state_destroy(brz->h0); + brz->h0 = NULL; DEBUGP("%u iterations remaining to create the graphs in a external file\n", iterations); if (mph->verbosity) { @@ -150,7 +167,6 @@ cmph_t *brz_new(cmph_config_t *mph, float c) { brz->offset[i] = brz->size[i-1] + brz->offset[i-1]; } - // Generating a mphf mphf = (cmph_t *)malloc(sizeof(cmph_t)); mphf->algo = mph->algo; @@ -161,14 +177,12 @@ cmph_t *brz_new(cmph_config_t *mph, float c) brz->h1 = NULL; //transfer memory ownership brzf->h2 = brz->h2; brz->h2 = NULL; //transfer memory ownership - brzf->h3 = brz->h3; - brz->h3 = NULL; //transfer memory ownership + brzf->h0 = brz->h0; + brz->h0 = NULL; //transfer memory ownership brzf->size = brz->size; brz->size = NULL; //transfer memory ownership brzf->offset = brz->offset; brz->offset = NULL; //transfer memory ownership - brzf->tmp_dir = brz->tmp_dir; - brz->tmp_dir = NULL; //transfer memory ownership brzf->k = brz->k; brzf->c = brz->c; brzf->m = brz->m; @@ -186,47 +200,42 @@ static int brz_gen_graphs(cmph_config_t *mph) { cmph_uint32 i, e; brz_config_data_t *brz = (brz_config_data_t *)mph->data; - //cmph_uint32 memory_availability = 200*1024*1024; cmph_uint32 memory_usage = 0; cmph_uint32 nkeys_in_buffer = 0; cmph_uint8 *buffer = (cmph_uint8 *)malloc(brz->memory_availability); - cmph_uint32 *buckets_size = (cmph_uint32 *)calloc(brz->k, sizeof(cmph_uint32)); + cmph_uint32 *buckets_size = (cmph_uint32 *)calloc(brz->k, sizeof(cmph_uint32)); cmph_uint32 *keys_index = NULL; cmph_uint8 **buffer_merge = NULL; - cmph_uint32 *buffer_h3 = NULL; + cmph_uint32 *buffer_h0 = NULL; cmph_uint32 nflushes = 0; - cmph_uint32 h3; + cmph_uint32 h0; FILE * tmp_fd = NULL; - FILE ** tmp_fds = NULL; + buffer_manage_t * buff_manage = NULL; char *filename = NULL; char *key = NULL; cmph_uint32 keylen; - cmph_uint32 max_size = 0; cmph_uint32 cur_bucket = 0; cmph_uint8 nkeys_vd = 0; - cmph_uint32 start_index = 0; char ** keys_vd = NULL; - mph->key_source->rewind(mph->key_source->data); DEBUGP("Generating graphs from %u keys\n", brz->m); // Partitioning for (e = 0; e < brz->m; ++e) { mph->key_source->read(mph->key_source->data, &key, &keylen); - + /* Buffers management */ if (memory_usage + keylen + 1 > brz->memory_availability) // flush buffers - { + { if(mph->verbosity) { fprintf(stderr, "Flushing %u\n", nkeys_in_buffer); } cmph_uint32 value = buckets_size[0]; cmph_uint32 sum = 0; - - cmph_uint32 keylen1 = 0; - buckets_size[0] = 0; + cmph_uint32 keylen1 = 0; + buckets_size[0] = 0; for(i = 1; i < brz->k; i++) { if(buckets_size[i] == 0) continue; @@ -239,20 +248,20 @@ static int brz_gen_graphs(cmph_config_t *mph) keys_index = (cmph_uint32 *)calloc(nkeys_in_buffer, sizeof(cmph_uint32)); for(i = 0; i < nkeys_in_buffer; i++) { - keylen1 = strlen(buffer + memory_usage); - h3 = hash(brz->h3, buffer + memory_usage, keylen1) % brz->k; - keys_index[buckets_size[h3]] = memory_usage; - buckets_size[h3]++; + keylen1 = strlen((char *)(buffer + memory_usage)); + h0 = hash(brz->h0, (char *)(buffer + memory_usage), keylen1) % brz->k; + keys_index[buckets_size[h0]] = memory_usage; + buckets_size[h0]++; memory_usage = memory_usage + keylen1 + 1; } - filename = (char *)calloc(strlen(brz->tmp_dir) + 11, sizeof(char)); + filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes); tmp_fd = fopen(filename, "wb"); free(filename); filename = NULL; for(i = 0; i < nkeys_in_buffer; i++) { - keylen1 = strlen(buffer + keys_index[i]) + 1; + keylen1 = strlen((char *)(buffer + keys_index[i])) + 1; fwrite(buffer + keys_index[i], 1, keylen1, tmp_fd); } nkeys_in_buffer = 0; @@ -264,17 +273,16 @@ static int brz_gen_graphs(cmph_config_t *mph) } memcpy(buffer + memory_usage, key, keylen + 1); memory_usage = memory_usage + keylen + 1; - h3 = hash(brz->h3, key, keylen) % brz->k; - if ((brz->size[h3] == MAX_BUCKET_SIZE) || ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h3]) < brz->size[h3])) + h0 = hash(brz->h0, key, keylen) % brz->k; + if ((brz->size[h0] == MAX_BUCKET_SIZE) || ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0])) { free(buffer); free(buckets_size); return 0; } - brz->size[h3] = brz->size[h3] + 1; - buckets_size[h3] ++; + brz->size[h0] = brz->size[h0] + 1; + buckets_size[h0] ++; nkeys_in_buffer++; - mph->key_source->dispose(mph->key_source->data, key, keylen); } @@ -299,20 +307,20 @@ static int brz_gen_graphs(cmph_config_t *mph) keys_index = (cmph_uint32 *)calloc(nkeys_in_buffer, sizeof(cmph_uint32)); for(i = 0; i < nkeys_in_buffer; i++) { - keylen1 = strlen(buffer + memory_usage); - h3 = hash(brz->h3, buffer + memory_usage, keylen1) % brz->k; - keys_index[buckets_size[h3]] = memory_usage; - buckets_size[h3]++; + keylen1 = strlen((char *)(buffer + memory_usage)); + h0 = hash(brz->h0, (char *)(buffer + memory_usage), keylen1) % brz->k; + keys_index[buckets_size[h0]] = memory_usage; + buckets_size[h0]++; memory_usage = memory_usage + keylen1 + 1; } - filename = (char *)calloc(strlen(brz->tmp_dir) + 11, sizeof(char)); + filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes); tmp_fd = fopen(filename, "wb"); free(filename); filename = NULL; for(i = 0; i < nkeys_in_buffer; i++) { - keylen1 = strlen(buffer + keys_index[i]) + 1; + keylen1 = strlen((char *)(buffer + keys_index[i])) + 1; fwrite(buffer + keys_index[i], 1, keylen1, tmp_fd); } nkeys_in_buffer = 0; @@ -322,66 +330,70 @@ static int brz_gen_graphs(cmph_config_t *mph) free(keys_index); fclose(tmp_fd); } + free(buffer); free(buckets_size); if(nflushes > 1024) return 0; // Too many files generated. - // mphf generation if(mph->verbosity) { fprintf(stderr, "\nMPHF generation \n"); } - tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *)); + /* Starting to dump to disk the resultant MPHF: __cmph_dump function */ + fwrite(cmph_names[CMPH_BRZ], (cmph_uint32)(strlen(cmph_names[CMPH_BRZ]) + 1), 1, brz->mphf_fd); + fwrite(&(brz->m), sizeof(brz->m), 1, brz->mphf_fd); + fwrite(&(brz->c), sizeof(cmph_float32), 1, brz->mphf_fd); + fwrite(&(brz->k), sizeof(cmph_uint32), 1, brz->mphf_fd); // number of MPHFs + fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), 1, brz->mphf_fd); + + //tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *)); + buff_manage = buffer_manage_new(brz->memory_availability, nflushes); buffer_merge = (cmph_uint8 **)calloc(nflushes, sizeof(cmph_uint8 *)); - buffer_h3 = (cmph_uint32 *)calloc(nflushes, sizeof(cmph_uint32)); - filename = (char *)calloc(strlen(brz->tmp_dir) + 11, sizeof(char)); - sprintf(filename, "%stmpg.cmph",brz->tmp_dir); - tmp_fd = fopen(filename, "w"); - free(filename); + buffer_h0 = (cmph_uint32 *)calloc(nflushes, sizeof(cmph_uint32)); + memory_usage = 0; for(i = 0; i < nflushes; i++) { - filename = (char *)calloc(strlen(brz->tmp_dir) + 11, sizeof(char)); + filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); sprintf(filename, "%s%u.cmph",brz->tmp_dir, i); - tmp_fds[i] = fopen(filename, "rb"); + buffer_manage_open(buff_manage, i, filename); free(filename); filename = NULL; - key = brz_read_key(tmp_fds[i]); + key = (char *)buffer_manage_read_key(buff_manage, i); keylen = strlen(key); - h3 = hash(brz->h3, key, keylen) % brz->k; - buffer_h3[i] = h3; + h0 = hash(brz->h0, key, keylen) % brz->k; + buffer_h0[i] = h0; buffer_merge[i] = (cmph_uint8 *)calloc(keylen + 1, sizeof(cmph_uint8)); memcpy(buffer_merge[i], key, keylen + 1); free(key); } - e = 0; keys_vd = (char **)calloc(MAX_BUCKET_SIZE, sizeof(char *)); nkeys_vd = 0; while(e < brz->m) { - i = brz_min_index(buffer_h3, nflushes); - cur_bucket = buffer_h3[i]; - key = brz_read_key(tmp_fds[i]); + i = brz_min_index(buffer_h0, nflushes); + cur_bucket = buffer_h0[i]; + key = (char *)buffer_manage_read_key(buff_manage, i); if(key) { while(key) { keylen = strlen(key); - h3 = hash(brz->h3, key, keylen) % brz->k; - - if (h3 != buffer_h3[i]) break; - + h0 = hash(brz->h0, key, keylen) % brz->k; + if (h0 != buffer_h0[i]) break; keys_vd[nkeys_vd++] = key; + key = NULL; //transfer memory ownership e++; - key = brz_read_key(tmp_fds[i]); + key = (char *)buffer_manage_read_key(buff_manage, i); } if (key) { assert(nkeys_vd < brz->size[cur_bucket]); - keys_vd[nkeys_vd++] = buffer_merge[i]; + keys_vd[nkeys_vd++] = (char *)buffer_merge[i]; + buffer_merge[i] = NULL; //transfer memory ownership e++; - buffer_h3[i] = h3; + buffer_h0[i] = h0; buffer_merge[i] = (cmph_uint8 *)calloc(keylen + 1, sizeof(cmph_uint8)); memcpy(buffer_merge[i], key, keylen + 1); free(key); @@ -390,10 +402,10 @@ static int brz_gen_graphs(cmph_config_t *mph) if(!key) { assert(nkeys_vd < brz->size[cur_bucket]); - keys_vd[nkeys_vd++] = buffer_merge[i]; + keys_vd[nkeys_vd++] = (char *)buffer_merge[i]; + buffer_merge[i] = NULL; //transfer memory ownership e++; - buffer_h3[i] = UINT_MAX; - buffer_merge[i] = NULL; + buffer_h0[i] = UINT_MAX; } if(nkeys_vd == brz->size[cur_bucket]) // Generating mphf for each bucket. @@ -402,35 +414,33 @@ static int brz_gen_graphs(cmph_config_t *mph) cmph_config_t *config = NULL; cmph_t *mphf_tmp = NULL; bmz8_data_t * bmzf = NULL; + char *bufmphf = NULL; + cmph_uint32 buflenmphf = 0; // Source of keys - if(nkeys_vd > max_size) max_size = nkeys_vd; source = cmph_io_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd); config = cmph_config_new(source); cmph_config_set_algo(config, CMPH_BMZ8); cmph_config_set_graphsize(config, brz->c); mphf_tmp = cmph_new(config); bmzf = (bmz8_data_t *)mphf_tmp->data; - brz_copy_partial_mphf(brz, bmzf, cur_bucket, source); - memory_usage += brz->size[cur_bucket]; - if((cur_bucket+1 == brz->k)||(memory_usage > brz->memory_availability)) - { - brz_flush_g(brz, &start_index, tmp_fd); - memory_usage = 0; - } + bufmphf = brz_copy_partial_mphf(brz, bmzf, cur_bucket, &buflenmphf); + bmzf = NULL; + fwrite(bufmphf, buflenmphf, 1, brz->mphf_fd); + free(bufmphf); + bufmphf = NULL; cmph_config_destroy(config); - brz_destroy_keys_vd(keys_vd, nkeys_vd); + brz_destroy_keys_vd(keys_vd, nkeys_vd); cmph_destroy(mphf_tmp); - free(source); + cmph_io_vector_adapter_destroy(source); + nkeys_vd = 0; } } - fclose(tmp_fd); - for(i = 0; i < nflushes; i++) fclose(tmp_fds[i]); - free(tmp_fds); + + buffer_manage_destroy(buff_manage); free(keys_vd); free(buffer_merge); - free(buffer_h3); - fprintf(stderr, "Maximal Size: %u\n", max_size); + free(buffer_h0); return 1; } @@ -467,7 +477,7 @@ static char * brz_read_key(FILE * fd) static void brz_destroy_keys_vd(char ** keys_vd, cmph_uint8 nkeys) { cmph_uint8 i; - for(i = 0; i < nkeys; i++) free(keys_vd[i]); + for(i = 0; i < nkeys; i++) { free(keys_vd[i]); keys_vd[i] = NULL;} } static void brz_flush_g(brz_config_data_t *brz, cmph_uint32 *start_index, FILE * fd) @@ -481,93 +491,84 @@ static void brz_flush_g(brz_config_data_t *brz, cmph_uint32 *start_index, FILE * } } -static void brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_io_adapter_t *source) +static char * brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen) +{ + cmph_uint32 i; + cmph_uint32 buflenh1 = 0; + cmph_uint32 buflenh2 = 0; + char * bufh1 = NULL; + char * bufh2 = NULL; + char * buf = NULL; + cmph_uint32 n = ceil(brz->c * brz->size[index]); + hash_state_dump(bmzf->hashes[0], &bufh1, &buflenh1); + hash_state_dump(bmzf->hashes[1], &bufh2, &buflenh2); + *buflen = buflenh1 + buflenh2 + n + 2*sizeof(cmph_uint32); + buf = (char *)malloc(*buflen); + //fprintf(stderr,"entrei passei\n"); + memcpy(buf, &buflenh1, sizeof(cmph_uint32)); + memcpy(buf+sizeof(cmph_uint32), bufh1, buflenh1); + memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32)); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, buflenh2); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2,bmzf->g, n); + free(bufh1); + free(bufh2); + return buf; +} +/*static void brz_copy_partial_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index) { cmph_uint32 i; cmph_uint32 n = ceil(brz->c * brz->size[index]); - + if( brz->g[index]) {fprintf(stderr, "index:%u\n",index);exit(10);} brz->g[index] = (cmph_uint8 *)calloc(n, sizeof(cmph_uint8)); for(i = 0; i < n; i++) { brz->g[index][i] = bmzf->g[i]; //fprintf(stderr, "gsrc[%u]: %u gdest: %u\n", i, (cmph_uint8) bmzf->g[i], brz->g[index][i]); - } - brz->h1[index] = hash_state_copy(bmzf->hashes[0]); - brz->h2[index] = hash_state_copy(bmzf->hashes[1]); + } + brz->h1[index] = hash_state_copy(bmzf->hashes[0]); + brz->h2[index] = hash_state_copy(bmzf->hashes[1]); } - +*/ int brz_dump(cmph_t *mphf, FILE *fd) { + brz_data_t *data = (brz_data_t *)mphf->data; char *buf = NULL; cmph_uint32 buflen; - cmph_uint32 i; - brz_data_t *data = (brz_data_t *)mphf->data; - FILE * tmpg_fd = NULL; - char * filename = NULL; - filename = (char *)calloc(strlen(data->tmp_dir) + 11, sizeof(char)); - sprintf(filename, "%stmpg.cmph",data->tmp_dir); - tmpg_fd = fopen(filename, "rb"); - free(filename); DEBUGP("Dumping brzf\n"); - __cmph_dump(mphf, fd); - - fwrite(&(data->k), sizeof(cmph_uint32), 1, fd); - //dumping h1 and h2. - for(i = 0; i < data->k; i++) - { - // h1 - hash_state_dump(data->h1[i], &buf, &buflen); - DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); - fwrite(&buflen, sizeof(cmph_uint32), 1, fd); - fwrite(buf, buflen, 1, fd); - free(buf); - // h2 - hash_state_dump(data->h2[i], &buf, &buflen); - DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); - fwrite(&buflen, sizeof(cmph_uint32), 1, fd); - fwrite(buf, buflen, 1, fd); - free(buf); - } - // Dumping h3. - hash_state_dump(data->h3, &buf, &buflen); - DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); - fwrite(&buflen, sizeof(cmph_uint32), 1, fd); - fwrite(buf, buflen, 1, fd); - free(buf); - - // Dumping c, m, size vector and offset vector. - fwrite(&(data->c), sizeof(cmph_float32), 1, fd); - fwrite(&(data->m), sizeof(cmph_uint32), 1, fd); - fwrite(data->size, sizeof(cmph_uint8)*(data->k), 1, fd); + // The initial part of the MPHF have already been dumped to disk during construction + // Dumping h0 + hash_state_dump(data->h0, &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + fwrite(&buflen, sizeof(cmph_uint32), 1, fd); + fwrite(buf, buflen, 1, fd); + free(buf); + // Dumping m and the vector offset. + fwrite(&(data->m), sizeof(cmph_uint32), 1, fd); fwrite(data->offset, sizeof(cmph_uint32)*(data->k), 1, fd); - - // Dumping g function. - for(i = 0; i < data->k; i++) - { - cmph_uint32 n = ceil(data->c * data->size[i]); - buf = (char *)calloc(n, sizeof(cmph_uint8)); - fread(buf, sizeof(cmph_uint8), n, tmpg_fd); - fwrite(buf, sizeof(cmph_uint8), n, fd); - free(buf); - } - fclose(tmpg_fd); return 1; } + + + void brz_load(FILE *f, cmph_t *mphf) { char *buf = NULL; cmph_uint32 buflen; - cmph_uint32 i; + cmph_uint32 i, n; brz_data_t *brz = (brz_data_t *)malloc(sizeof(brz_data_t)); DEBUGP("Loading brz mphf\n"); mphf->data = brz; + fread(&(brz->c), sizeof(cmph_float32), 1, f); fread(&(brz->k), sizeof(cmph_uint32), 1, f); + brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k); + fread(brz->size, sizeof(cmph_uint8)*(brz->k), 1, f); brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); + brz->g = (cmph_uint8 **) calloc(brz->k, sizeof(cmph_uint8 *)); DEBUGP("Reading %u h1 and %u h2\n", brz->k, brz->k); - //loading h1 and h2. + //loading h_i1, h_i2 and g_i. for(i = 0; i < brz->k; i++) { // h1 @@ -583,69 +584,62 @@ void brz_load(FILE *f, cmph_t *mphf) buf = (char *)malloc(buflen); fread(buf, buflen, 1, f); brz->h2[i] = hash_state_load(buf, buflen); - free(buf); + free(buf); + n = ceil(brz->c * brz->size[i]); + DEBUGP("g_i has %u bytes\n", n); + brz->g[i] = (cmph_uint8 *)calloc(n, sizeof(cmph_uint8)); + fread(brz->g[i], sizeof(cmph_uint8)*n, 1, f); } - //loading h3 + //loading h0 fread(&buflen, sizeof(cmph_uint32), 1, f); DEBUGP("Hash state has %u bytes\n", buflen); buf = (char *)malloc(buflen); fread(buf, buflen, 1, f); - brz->h3 = hash_state_load(buf, buflen); - free(buf); + brz->h0 = hash_state_load(buf, buflen); + free(buf); - //loading c, m, size vector and offset vector. - fread(&(brz->c), sizeof(cmph_float32), 1, f); + //loading c, m, and the vector offset. fread(&(brz->m), sizeof(cmph_uint32), 1, f); - brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k); brz->offset = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*brz->k); - fread(brz->size, sizeof(cmph_uint8)*(brz->k), 1, f); - fread(brz->offset, sizeof(cmph_uint32)*(brz->k), 1, f); - - //loading g function. - brz->g = (cmph_uint8 **) malloc(sizeof(cmph_uint8 *)*brz->k); - for(i = 0; i < brz->k; i++) - { - cmph_uint32 n = ceil(brz->c * brz->size[i]); - brz->g[i] = (cmph_uint8 *)malloc(sizeof(cmph_uint8)*n); - fread(brz->g[i], sizeof(cmph_uint8)*n, 1, f); - } + fread(brz->offset, sizeof(cmph_uint32)*(brz->k), 1, f); return; } - cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { brz_data_t *brz = mphf->data; - cmph_uint32 h3 = hash(brz->h3, key, keylen) % brz->k; - cmph_uint32 m = brz->size[h3]; + cmph_uint32 h0 = hash(brz->h0, key, keylen) % brz->k; + cmph_uint32 m = brz->size[h0]; cmph_uint32 n = ceil(brz->c * m); - cmph_uint32 h1 = hash(brz->h1[h3], key, keylen) % n; - cmph_uint32 h2 = hash(brz->h2[h3], key, keylen) % n; + cmph_uint32 h1 = hash(brz->h1[h0], key, keylen) % n; + cmph_uint32 h2 = hash(brz->h2[h0], key, keylen) % n; cmph_uint8 mphf_bucket; if (h1 == h2 && ++h2 >= n) h2 = 0; - mphf_bucket = brz->g[h3][h1] + brz->g[h3][h2]; - DEBUGP("key: %s h1: %u h2: %u h3: %u\n", key, h1, h2, h3); - DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h3]: %u edges: %u\n", key, brz->g[h3][h1], brz->g[h3][h2], brz->offset[h3], brz->m); - DEBUGP("Address: %u\n", mphf_bucket + brz->offset[h3]); - return (mphf_bucket + brz->offset[h3]); + mphf_bucket = brz->g[h0][h1] + brz->g[h0][h2]; + DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0); + DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h0]: %u edges: %u\n", key, brz->g[h0][h1], brz->g[h0][h2], brz->offset[h0], brz->m); + DEBUGP("Address: %u\n", mphf_bucket + brz->offset[h0]); + return (mphf_bucket + brz->offset[h0]); } void brz_destroy(cmph_t *mphf) { cmph_uint32 i; brz_data_t *data = (brz_data_t *)mphf->data; - for(i = 0; i < data->k; i++) + if(data->g) { - free(data->g[i]); - hash_state_destroy(data->h1[i]); - hash_state_destroy(data->h2[i]); + for(i = 0; i < data->k; i++) + { + free(data->g[i]); + hash_state_destroy(data->h1[i]); + hash_state_destroy(data->h2[i]); + } + free(data->g); + free(data->h1); + free(data->h2); } - hash_state_destroy(data->h3); - free(data->g); - free(data->h1); - free(data->h2); + hash_state_destroy(data->h0); free(data->size); free(data->offset); - free(data->tmp_dir); free(data); free(mphf); } diff --git a/src/brz.h b/src/brz.h index e1c932e..88b9487 100644 --- a/src/brz.h +++ b/src/brz.h @@ -9,6 +9,8 @@ typedef struct __brz_config_data_t brz_config_data_t; brz_config_data_t *brz_config_new(); void brz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir); +void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd); +void brz_config_set_b(cmph_config_t *mph, cmph_uint8 b); void brz_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability); void brz_config_destroy(cmph_config_t *mph); cmph_t *brz_new(cmph_config_t *mph, float c); diff --git a/src/brz_structs.h b/src/brz_structs.h index e76e717..537dc12 100755 --- a/src/brz_structs.h +++ b/src/brz_structs.h @@ -13,8 +13,7 @@ struct __brz_data_t cmph_uint32 k; // number of components hash_state_t **h1; hash_state_t **h2; - hash_state_t * h3; - cmph_uint8 * tmp_dir; // temporary directory + hash_state_t * h0; }; struct __brz_config_data_t @@ -25,12 +24,14 @@ struct __brz_config_data_t cmph_uint8 *size; // size[i] stores the number of edges represented by g[i][...]. cmph_uint32 *offset; // offset[i] stores the sum: size[0] + size[1] + ... size[i-1]. cmph_uint8 **g; // g function. + cmph_uint8 b; // parameter b. cmph_uint32 k; // number of components hash_state_t **h1; hash_state_t **h2; - hash_state_t * h3; + hash_state_t * h0; cmph_uint32 memory_availability; cmph_uint8 * tmp_dir; // temporary directory + FILE * mphf_fd; // mphf file }; #endif diff --git a/src/cmph.c b/src/cmph.c index 07e888a..be4e64d 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -49,7 +49,8 @@ static int key_nlfile_read(void *data, char **key, cmph_uint32 *keylen) static int key_vector_read(void *data, char **key, cmph_uint32 *keylen) { - cmph_vector_t *cmph_vector = (cmph_vector_t *)data; +/* + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; char **keys_vd = (char **)cmph_vector->vector; if (keys_vd + cmph_vector->position == NULL) return -1; @@ -57,7 +58,17 @@ static int key_vector_read(void *data, char **key, cmph_uint32 *keylen) *key = (char *)malloc(*keylen + 1); strcpy(*key, *(keys_vd + cmph_vector->position)); cmph_vector->position = cmph_vector->position + 1; +*/ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + char **keys_vd = (char **)cmph_vector->vector; + +// if (keys_vd + cmph_vector->position == NULL) return -1; + *keylen = strlen(keys_vd[cmph_vector->position]); + *key = (char *)malloc(*keylen + 1); + strcpy(*key, keys_vd[cmph_vector->position]); + cmph_vector->position = cmph_vector->position + 1; return *keylen; + } @@ -68,7 +79,7 @@ static void key_nlfile_dispose(void *data, char *key, cmph_uint32 keylen) static void key_vector_dispose(void *data, char *key, cmph_uint32 keylen) { - key_nlfile_dispose(data, key, keylen); + free(key); } static void key_nlfile_rewind(void *data) @@ -236,7 +247,43 @@ void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) default: assert(0); } +} + +void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) +{ + switch (mph->algo) + { + case CMPH_CHM: + break; + case CMPH_BMZ: /* included -- Fabiano */ + break; + case CMPH_BMZ8: /* included -- Fabiano */ + break; + case CMPH_BRZ: /* included -- Fabiano */ + brz_config_set_mphf_fd(mph, mphf_fd); + break; + default: + assert(0); + } +} + +void cmph_config_set_b(cmph_config_t *mph, cmph_uint8 b) +{ + switch (mph->algo) + { + case CMPH_CHM: + break; + case CMPH_BMZ: /* included -- Fabiano */ + break; + case CMPH_BMZ8: /* included -- Fabiano */ + break; + case CMPH_BRZ: /* included -- Fabiano */ + brz_config_set_b(mph, b); + break; + default: + assert(0); + } } void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability) diff --git a/src/cmph.h b/src/cmph.h index 50b2d66..0d42bcb 100644 --- a/src/cmph.h +++ b/src/cmph.h @@ -41,6 +41,8 @@ void cmph_config_set_verbosity(cmph_config_t *mph, cmph_uint32 verbosity); void cmph_config_set_graphsize(cmph_config_t *mph, float c); void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo); void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir); +void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd); +void cmph_config_set_b(cmph_config_t *mph, cmph_uint8 b); void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability); void cmph_config_destroy(cmph_config_t *mph); diff --git a/src/jenkins_hash.c b/src/jenkins_hash.c index 4ed64f5..d720fcd 100644 --- a/src/jenkins_hash.c +++ b/src/jenkins_hash.c @@ -89,9 +89,6 @@ jenkins_state_t *jenkins_state_new(cmph_uint32 size) //size of hash table jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t)); DEBUGP("Initializing jenkins hash\n"); state->seed = rand() % size; - state->nbits = (cmph_uint32)ceil(log(size)/M_LOG2E); - state->size = size; - DEBUGP("Initialized jenkins with size %u, nbits %u and seed %u\n", size, state->nbits, state->seed); return state; } void jenkins_state_destroy(jenkins_state_t *state) @@ -164,7 +161,7 @@ cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keyl void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen) { - *buflen = sizeof(cmph_uint32)*3; + *buflen = sizeof(cmph_uint32); *buf = malloc(*buflen); if (!*buf) { @@ -172,10 +169,7 @@ void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen) return; } memcpy(*buf, &(state->seed), sizeof(cmph_uint32)); - memcpy(*buf + sizeof(cmph_uint32), &(state->nbits), sizeof(cmph_uint32)); - memcpy(*buf + sizeof(cmph_uint32)*2, &(state->size), sizeof(cmph_uint32)); DEBUGP("Dumped jenkins state with seed %u\n", state->seed); - return; } @@ -184,8 +178,6 @@ jenkins_state_t *jenkins_state_copy(jenkins_state_t *src_state) jenkins_state_t *dest_state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t)); dest_state->hashfunc = src_state->hashfunc; dest_state->seed = src_state->seed; - dest_state->nbits = src_state->nbits; - dest_state->size = src_state->size; return dest_state; } @@ -193,8 +185,6 @@ jenkins_state_t *jenkins_state_load(const char *buf, cmph_uint32 buflen) { jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t)); state->seed = *(cmph_uint32 *)buf; - state->nbits = *(((cmph_uint32 *)buf) + 1); - state->size = *(((cmph_uint32 *)buf) + 2); state->hashfunc = CMPH_HASH_JENKINS; DEBUGP("Loaded jenkins state with seed %u\n", state->seed); return state; diff --git a/src/jenkins_hash.h b/src/jenkins_hash.h index e259ee2..17b0cf9 100644 --- a/src/jenkins_hash.h +++ b/src/jenkins_hash.h @@ -7,8 +7,6 @@ typedef struct __jenkins_state_t { CMPH_HASH hashfunc; cmph_uint32 seed; - cmph_uint32 nbits; - cmph_uint32 size; } jenkins_state_t; jenkins_state_t *jenkins_state_new(cmph_uint32 size); //size of hash table diff --git a/src/main.c b/src/main.c index 8a7f977..4db2104 100644 --- a/src/main.c +++ b/src/main.c @@ -22,12 +22,12 @@ void usage(const char *prg) { - fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-d tmp_dir] [-m file.mph] keysfile\n", prg); + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile\n", prg); } void usage_long(const char *prg) { cmph_uint32 i; - fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-d tmp_dir] [-m file.mph] keysfile\n", prg); + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile\n", prg); fprintf(stderr, "Minimum perfect hashing tool\n\n"); fprintf(stderr, " -h\t print this help message\n"); fprintf(stderr, " -c\t c value that determines the number of vertices in the graph\n"); @@ -43,13 +43,13 @@ void usage_long(const char *prg) fprintf(stderr, " -m\t minimum perfect hash function file \n"); fprintf(stderr, " -M\t main memory availability (in MB)\n"); fprintf(stderr, " -d\t temporary directory used in brz algorithm \n"); + fprintf(stderr, " -b\t parmeter of BRZ algorithm to make the maximal number of keys in a bucket lower than 256\n"); fprintf(stderr, " keysfile\t line separated file with keys\n"); } - int main(int argc, char **argv) { - char verbosity = 0; + char verbosity = 0; char generate = 0; char *mphf_file = NULL; FILE *mphf_fd = stdout; @@ -67,9 +67,10 @@ int main(int argc, char **argv) cmph_uint8 * tmp_dir = NULL; cmph_io_adapter_t *source; cmph_uint32 memory_availability = 0; + cmph_uint32 b = 128; while (1) { - char ch = getopt(argc, argv, "hVvgc:k:a:M:f:m:d:s:"); + char ch = getopt(argc, argv, "hVvgc:k:a:M:b:f:m:d:s:"); if (ch == -1) break; switch (ch) { @@ -122,6 +123,16 @@ int main(int argc, char **argv) } } break; + case 'b': + { + char *cptr; + b = strtoul(optarg, &cptr, 10); + if(*cptr != 0) { + fprintf(stderr, "Parameter b was not found: %s\n", optarg); + exit(1); + } + } + break; case 'v': ++verbosity; break; @@ -184,9 +195,9 @@ int main(int argc, char **argv) return 1; } keys_file = argv[optind]; + if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); srand(seed); - int ret = 0; if (mphf_file == NULL) { @@ -196,6 +207,7 @@ int main(int argc, char **argv) } keys_fd = fopen(keys_file, "r"); + if (keys_fd == NULL) { fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno)); @@ -209,33 +221,35 @@ int main(int argc, char **argv) if (generate) { //Create mphf + mphf_fd = fopen(mphf_file, "w"); config = cmph_config_new(source); cmph_config_set_algo(config, mph_algo); if (nhashes) cmph_config_set_hashfuncs(config, hashes); cmph_config_set_verbosity(config, verbosity); cmph_config_set_tmp_dir(config, tmp_dir); + cmph_config_set_mphf_fd(config, mphf_fd); cmph_config_set_memory_availability(config, memory_availability); + cmph_config_set_b(config, b); if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15; if (c != 0) cmph_config_set_graphsize(config, c); mphf = cmph_new(config); - + cmph_config_destroy(config); if (mphf == NULL) { fprintf(stderr, "Unable to create minimum perfect hashing function\n"); - cmph_config_destroy(config); + //cmph_config_destroy(config); free(mphf_file); return -1; } - mphf_fd = fopen(mphf_file, "w"); if (mphf_fd == NULL) { fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno)); free(mphf_file); return -1; } - cmph_dump(mphf, mphf_fd); - cmph_destroy(mphf); + cmph_dump(mphf, mphf_fd); + cmph_destroy(mphf); fclose(mphf_fd); } else @@ -289,6 +303,7 @@ int main(int argc, char **argv) fclose(keys_fd); free(mphf_file); free(tmp_dir); - free(source); + cmph_io_nlfile_adapter_destroy(source); return ret; + } diff --git a/src/vqueue.c b/src/vqueue.c index 0e55095..c2cf3bb 100644 --- a/src/vqueue.c +++ b/src/vqueue.c @@ -46,5 +46,5 @@ void vqueue_print(vqueue_t * q) void vqueue_destroy(vqueue_t *q) { - free(q->values); q->values = NULL; + free(q->values); q->values = NULL; free(q); } diff --git a/vldb/pt/figs/brz.fig b/vldb/pt/figs/brz.fig index 4f11d1e..e08aae4 100644 --- a/vldb/pt/figs/brz.fig +++ b/vldb/pt/figs/brz.fig @@ -8,96 +8,79 @@ Single -2 1200 2 0 32 #bebebe -6 3285 3600 3555 4230 -6 3285 3780 3555 4230 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3285 4140 3555 4140 3555 4230 3285 4230 3285 4140 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3285 4050 3555 4050 3555 4140 3285 4140 3285 4050 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3285 3960 3555 3960 3555 4050 3285 4050 3285 3960 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3285 3870 3555 3870 3555 3960 3285 3960 3285 3870 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3285 3780 3555 3780 3555 3870 3285 3870 3285 3780 --6 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3285 3690 3555 3690 3555 3780 3285 3780 3285 3690 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3285 3600 3555 3600 3555 3690 3285 3690 3285 3600 --6 -6 1800 4500 3330 5175 +6 2025 3015 3555 3690 2 3 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 - 1800 4770 2070 4770 2070 4500 3060 4500 3060 4770 3330 4770 - 2565 5175 1800 4770 -4 0 0 50 -1 0 10 0.0000 4 150 600 2265 4867 Spreading\001 + 2025 3285 2295 3285 2295 3015 3285 3015 3285 3285 3555 3285 + 2790 3690 2025 3285 +4 0 0 50 -1 0 10 0.0000 4 135 765 2385 3330 Partitioning\001 -6 -6 2250 3060 2880 3600 -6 2250 3060 2880 3600 -6 2250 3060 2880 3600 -6 2250 3060 2880 3600 -2 3 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 - 2250 3330 2430 3330 2430 3060 2700 3060 2700 3330 2880 3330 - 2565 3600 2250 3330 +6 1890 3735 3780 4365 +6 2430 3735 2700 4365 +6 2430 3915 2700 4365 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2430 4275 2700 4275 2700 4365 2430 4365 2430 4275 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2430 4185 2700 4185 2700 4275 2430 4275 2430 4185 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2430 4095 2700 4095 2700 4185 2430 4185 2430 4095 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2430 4005 2700 4005 2700 4095 2430 4095 2430 4005 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2430 3915 2700 3915 2700 4005 2430 4005 2430 3915 -6 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2430 3825 2700 3825 2700 3915 2430 3915 2430 3825 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2430 3735 2700 3735 2700 3825 2430 3825 2430 3735 -6 -4 0 0 50 -1 0 10 0.0000 4 105 75 2521 3382 h\001 --6 -4 0 0 50 -1 0 6 0.0000 4 60 45 2589 3419 1\001 --6 -6 1395 2655 3825 2970 -2 4 0 1 0 7 50 -1 -1 0.000 0 0 7 0 0 5 - 3825 2970 3825 2655 1395 2655 1395 2970 3825 2970 -4 0 0 50 -1 0 10 0.0000 4 135 795 2212 2850 Set of Keys S\001 --6 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 1890 4275 2160 4275 2160 4365 1890 4365 1890 4275 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 1890 4185 2160 4185 2160 4275 1890 4275 1890 4185 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2160 4275 2430 4275 2430 4365 2160 4365 2160 4275 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2160 4185 2430 4185 2430 4275 2160 4275 2160 4185 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2160 4095 2430 4095 2430 4185 2160 4185 2160 4095 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2160 4005 2430 4005 2430 4095 2160 4095 2160 4005 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2160 3915 2430 3915 2430 4005 2160 4005 2160 3915 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2700 4275 2970 4275 2970 4365 2700 4365 2700 4275 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2700 4185 2970 4185 2970 4275 2700 4275 2700 4185 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2700 4095 2970 4095 2970 4185 2700 4185 2700 4095 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2700 4005 2970 4005 2970 4095 2700 4095 2700 4005 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2160 3825 2430 3825 2430 3915 2160 3915 2160 3825 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3240 4275 3510 4275 3510 4365 3240 4365 3240 4275 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3510 4275 3780 4275 3780 4365 3510 4365 3510 4275 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2970 4275 3240 4275 3240 4365 2970 4365 2970 4275 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3240 4185 3510 4185 3510 4275 3240 4275 3240 4185 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 1890 4095 2160 4095 2160 4185 1890 4185 1890 4095 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3510 4185 3780 4185 3780 4275 3510 4275 3510 4185 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3240 4095 3510 4095 3510 4185 3240 4185 3240 4095 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3240 4005 3510 4005 3510 4095 3240 4095 3240 4005 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3240 3915 3510 3915 3510 4005 3240 4005 3240 3915 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 - 1395 4230 3825 4230 + 1890 4365 3780 4365 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 1395 4140 1665 4140 1665 4230 1395 4230 1395 4140 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 1395 4050 1665 4050 1665 4140 1395 4140 1395 4050 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 1665 4140 1935 4140 1935 4230 1665 4230 1665 4140 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 1665 4050 1935 4050 1935 4140 1665 4140 1665 4050 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 1665 3960 1935 3960 1935 4050 1665 4050 1665 3960 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 1665 3870 1935 3870 1935 3960 1665 3960 1665 3870 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 1665 3780 1935 3780 1935 3870 1665 3870 1665 3780 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 2205 4140 2475 4140 2475 4230 2205 4230 2205 4140 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 2205 4050 2475 4050 2475 4140 2205 4140 2205 4050 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 2205 3960 2475 3960 2475 4050 2205 4050 2205 3960 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 2205 3870 2475 3870 2475 3960 2205 3960 2205 3870 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 1665 3690 1935 3690 1935 3780 1665 3780 1665 3690 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 2745 4140 3015 4140 3015 4230 2745 4230 2745 4140 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3015 4140 3285 4140 3285 4230 3015 4230 3015 4140 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 2475 4140 2745 4140 2745 4230 2475 4230 2475 4140 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 2745 4050 3015 4050 3015 4140 2745 4140 2745 4050 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 1395 3960 1665 3960 1665 4050 1395 4050 1395 3960 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3555 4140 3825 4140 3825 4230 3555 4230 3555 4140 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3555 4050 3825 4050 3825 4140 3555 4140 3555 4050 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3015 4050 3285 4050 3285 4140 3015 4140 3015 4050 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 2745 3960 3015 3960 3015 4050 2745 4050 2745 3960 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 2745 3870 3015 3870 3015 3960 2745 3960 2745 3870 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 2745 3780 3015 3780 3015 3870 2745 3870 2745 3780 + 2970 4185 3240 4185 3240 4275 2970 4275 2970 4185 +-6 +6 1260 5310 4230 5580 2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 1260 5400 4230 5400 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 @@ -122,14 +105,49 @@ Single 3150 5310 3420 5310 3420 5400 3150 5400 3150 5310 2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 1260 5310 1530 5310 1530 5400 1260 5400 1260 5310 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3285 3510 3555 3510 3555 3600 3285 3600 3285 3510 -2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 - 3285 3420 3555 3420 3555 3510 3285 3510 3285 3420 -4 0 0 50 -1 0 10 0.0000 4 105 75 1485 4410 0\001 -4 0 0 50 -1 0 10 0.0000 4 105 210 3600 4410 b-1\001 -4 0 0 50 -1 0 10 0.0000 4 105 480 720 4050 Buckets\001 -4 0 0 50 -1 0 10 0.0000 4 105 90 900 4230 B\001 4 0 0 50 -1 0 10 0.0000 4 105 210 4005 5580 n-1\001 4 0 0 50 -1 0 10 0.0000 4 105 75 1350 5580 0\001 -4 0 0 50 -1 0 10 0.0000 4 105 690 450 5400 Hash Table\001 +-6 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 1260 2925 4230 2925 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 1530 2835 1800 2835 1800 2925 1530 2925 1530 2835 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2070 2835 2340 2835 2340 2925 2070 2925 2070 2835 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2340 2835 2610 2835 2610 2925 2340 2925 2340 2835 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2610 2835 2880 2835 2880 2925 2610 2925 2610 2835 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 2880 2835 3150 2835 3150 2925 2880 2925 2880 2835 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3420 2835 3690 2835 3690 2925 3420 2925 3420 2835 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3690 2835 3960 2835 3960 2925 3690 2925 3690 2835 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3960 2835 4230 2835 4230 2925 3960 2925 3960 2835 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 1800 2835 2070 2835 2070 2925 1800 2925 1800 2835 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 3150 2835 3420 2835 3420 2925 3150 2925 3150 2835 +2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5 + 1260 2835 1530 2835 1530 2925 1260 2925 1260 2835 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3510 4410 3510 4590 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3510 4410 3600 4410 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3690 4410 3780 4410 +2 3 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 8 + 2025 4815 2295 4815 2295 4545 3285 4545 3285 4815 3555 4815 + 2790 5220 2025 4815 +2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 2 + 3780 4410 3780 4590 +4 0 0 50 -1 0 10 0.0000 4 135 585 2475 4860 Searching\001 +4 0 0 50 -1 0 10 0.0000 4 105 75 1980 4545 0\001 +4 0 0 50 -1 0 10 0.0000 4 105 690 4410 5400 Hash Table\001 +4 0 0 50 -1 0 10 0.0000 4 105 480 4410 4230 Buckets\001 +4 0 0 50 -1 0 10 0.0000 4 135 555 4410 2925 Key set S\001 +4 0 0 50 -1 0 10 0.0000 4 105 75 1350 2745 0\001 +4 0 0 50 -1 0 10 0.0000 4 105 210 4005 2745 n-1\001 +4 0 0 50 -1 0 10 0.0000 4 105 420 3555 4545 n/b - 1\001