diff --git a/FAQ.t2t b/FAQ.t2t new file mode 100644 index 0000000..f0d40d2 --- /dev/null +++ b/FAQ.t2t @@ -0,0 +1,27 @@ +CMPH FAQ + + + +- How do I define the ids of the keys? + - You don't. The ids will be assigned by the algorithm creating the minimal + perfect hash function. If the algorithm creates an **ordered** minimal + perfect hash function, the ids will be the indices of the keys in the + input. Otherwise, you have no guarantee of the distribution of the ids. + +- Why do I always get the error "Unable to create minimum perfect hashing function"? + - The algorithms do not guarantee that a minimal perfect hash function can + be created. In practice, it will always work if your input + is big enough (>100 keys). + The error is probably because you have duplicated + keys in the input. You must guarantee that the keys are unique in the + input. If you are using a UN*X based OS, try doing +``` #sort input.txt | uniq > input_uniq.txt + and run cmph with input_uniq.txt + +---------------------------------------- +[Home index.html] +---------------------------------------- + +Davi de Castro Reis + +Fabiano Cupertino Botelho diff --git a/README.t2t b/README.t2t index e0ebc1d..8b12570 100644 --- a/README.t2t +++ b/README.t2t @@ -159,6 +159,10 @@ utility. keysfile line separated file with keys ``` +**Additional Documentation** + +[FAQ faq.html] + **Downloads** Use the project page at sourceforge: http://sf.net/projects/cmph @@ -171,9 +175,9 @@ Code is under the LGPL. Enjoy! 
-Davi de Castro Reis +Davi de Castro Reis davi@users.sourceforge.net -Fabiano Cupertino Botelho +Fabiano Cupertino Botelho fc_botelho@users.sourceforge.net %!include(html): ''LOGO.html'' Last Updated: %%date(%c) diff --git a/gendocs b/gendocs index 71021f7..e7c5e3e 100755 --- a/gendocs +++ b/gendocs @@ -1,6 +1,13 @@ -txt2tags -t html -i README.t2t -o index.html +txt2tags -t html --mask-email -i README.t2t -o index.html txt2tags -t html -i BMZ.t2t -o bmz.html txt2tags -t html -i CHM.t2t -o chm.html txt2tags -t html -i COMPARISON.t2t -o comparison.html txt2tags -t html -i GPERF.t2t -o gperf.html -txt2tags -t txt -i README.t2t -o README +txt2tags -t html -i FAQ.t2t -o faq.html + +txt2tags -t txt --mask-email -i README.t2t -o README +txt2tags -t txt -i BMZ.t2t -o BMZ +txt2tags -t txt -i CHM.t2t -o CHM +txt2tags -t txt -i COMPARISON.t2t -o COMPARISON +txt2tags -t txt -i GPERF.t2t -o GPERF +txt2tags -t txt -i FAQ.t2t -o FAQ diff --git a/src/bmz.c b/src/bmz.c index bc75015..895f2bd 100644 --- a/src/bmz.c +++ b/src/bmz.c @@ -27,7 +27,7 @@ static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint3 static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, cmph_uint32 v, cmph_uint32 * biggest_g_value, cmph_uint32 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited); static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint8 * visited); -bmz_config_data_t *bmz_config_new(cmph_io_adapter_t *key_source) +bmz_config_data_t *bmz_config_new() { bmz_config_data_t *bmz = NULL; bmz = (bmz_config_data_t *)malloc(sizeof(bmz_config_data_t)); diff --git a/src/bmz.h b/src/bmz.h index 13c0f87..2d444a0 100644 --- a/src/bmz.h +++ b/src/bmz.h @@ -6,7 +6,7 @@ typedef struct __bmz_data_t bmz_data_t; typedef struct __bmz_config_data_t bmz_config_data_t; -bmz_config_data_t *bmz_config_new(cmph_io_adapter_t *key_source); +bmz_config_data_t *bmz_config_new(); void 
bmz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); void bmz_config_destroy(cmph_config_t *mph); cmph_t *bmz_new(cmph_config_t *mph, float c); diff --git a/src/chm.c b/src/chm.c index 5739c3b..486d438 100644 --- a/src/chm.c +++ b/src/chm.c @@ -10,20 +10,14 @@ #include #include #include -#include //#define DEBUG #include "debug.h" -/* static const char bitmask[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; */ -/* #define GETBIT(array, i) (array[(i) / 8] & bitmask[(i) % 8]) */ -/* #define SETBIT(array, i) (array[(i) / 8] |= bitmask[(i) % 8]) */ -/* #define UNSETBIT(array, i) (array[(i) / 8] &= (~(bitmask[(i) % 8]))) */ - static int chm_gen_edges(cmph_config_t *mph); static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v); -chm_config_data_t *chm_config_new(cmph_io_adapter_t *key_source) +chm_config_data_t *chm_config_new() { chm_config_data_t *chm = NULL; chm = (chm_config_data_t *)malloc(sizeof(chm_config_data_t)); @@ -173,7 +167,7 @@ static int chm_gen_edges(cmph_config_t *mph) chm_config_data_t *chm = (chm_config_data_t *)mph->data; int cycles = 0; - DEBUGP("Generating edges for %u vertices\n", chm->n); + DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", chm->n, cmph_hash_names[chm->hashfuncs[0]], cmph_hash_names[chm->hashfuncs[1]]); graph_clear_edges(chm->graph); mph->key_source->rewind(mph->key_source->data); for (e = 0; e < mph->key_source->nkeys; ++e) @@ -206,39 +200,28 @@ int chm_dump(cmph_t *mphf, FILE *fd) { char *buf = NULL; cmph_uint32 buflen; - cmph_uint32 nbuflen; cmph_uint32 i; - cmph_uint32 two = htonl(2); //number of hash functions + cmph_uint32 two = 2; //number of hash functions chm_data_t *data = (chm_data_t *)mphf->data; - cmph_uint32 nn, nm; __cmph_dump(mphf, fd); fwrite(&two, sizeof(cmph_uint32), 1, fd); - hash_state_dump(data->hashes[0], &buf, &buflen); DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); - nbuflen = htonl(buflen); - 
fwrite(&nbuflen, sizeof(cmph_uint32), 1, fd); + fwrite(&buflen, sizeof(cmph_uint32), 1, fd); fwrite(buf, buflen, 1, fd); free(buf); hash_state_dump(data->hashes[1], &buf, &buflen); DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); - nbuflen = htonl(buflen); - fwrite(&nbuflen, sizeof(cmph_uint32), 1, fd); + fwrite(&buflen, sizeof(cmph_uint32), 1, fd); fwrite(buf, buflen, 1, fd); free(buf); - nn = htonl(data->n); - fwrite(&nn, sizeof(cmph_uint32), 1, fd); - nm = htonl(data->m); - fwrite(&nm, sizeof(cmph_uint32), 1, fd); + fwrite(&(data->n), sizeof(cmph_uint32), 1, fd); + fwrite(&(data->m), sizeof(cmph_uint32), 1, fd); - for (i = 0; i < data->n; ++i) - { - cmph_uint32 ng = htonl(data->g[i]); - fwrite(&ng, sizeof(cmph_uint32), 1, fd); - } + fwrite(data->g, sizeof(cmph_uint32)*data->n, 1, fd); #ifdef DEBUG fprintf(stderr, "G: "); for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]); @@ -260,7 +243,6 @@ void chm_load(FILE *f, cmph_t *mphf) DEBUGP("Loading chm mphf\n"); mphf->data = chm; fread(&nhashes, sizeof(cmph_uint32), 1, f); - nhashes = ntohl(nhashes); chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1)); chm->hashes[nhashes] = NULL; DEBUGP("Reading %u hashes\n", nhashes); @@ -268,7 +250,6 @@ void chm_load(FILE *f, cmph_t *mphf) { hash_state_t *state = NULL; fread(&buflen, sizeof(cmph_uint32), 1, f); - buflen = ntohl(buflen); DEBUGP("Hash state has %u bytes\n", buflen); buf = (char *)malloc(buflen); fread(buf, buflen, 1, f); @@ -279,13 +260,10 @@ void chm_load(FILE *f, cmph_t *mphf) DEBUGP("Reading m and n\n"); fread(&(chm->n), sizeof(cmph_uint32), 1, f); - chm->n = ntohl(chm->n); fread(&(chm->m), sizeof(cmph_uint32), 1, f); - chm->m = ntohl(chm->m); chm->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*chm->n); fread(chm->g, chm->n*sizeof(cmph_uint32), 1, f); - for (i = 0; i < chm->n; ++i) chm->g[i] = ntohl(chm->g[i]); #ifdef DEBUG fprintf(stderr, "G: "); for (i = 0; i < chm->n; ++i) fprintf(stderr, "%u ", chm->g[i]); 
diff --git a/src/chm.h b/src/chm.h index f98d83d..0f7ac3f 100644 --- a/src/chm.h +++ b/src/chm.h @@ -6,7 +6,7 @@ typedef struct __chm_data_t chm_data_t; typedef struct __chm_config_data_t chm_config_data_t; -chm_config_data_t *chm_config_new(cmph_io_adapter_t *key_source); +chm_config_data_t *chm_config_new(); void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); void chm_config_destroy(cmph_config_t *mph); cmph_t *chm_new(cmph_config_t *mph, float c); diff --git a/src/cmph.c b/src/cmph.c index 6aed7ee..490db0e 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -98,12 +98,38 @@ cmph_config_t *cmph_config_new(cmph_io_adapter_t *key_source) mph = __config_new(key_source); assert(mph); mph->algo = CMPH_CHM; // default value + mph->data = chm_config_new(); return mph; } void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) { - mph->algo = algo; + if (algo != mph->algo) + { + switch (mph->algo) + { + case CMPH_CHM: + chm_config_destroy(mph->data); + break; + case CMPH_BMZ: + bmz_config_destroy(mph->data); + break; + default: + assert(0); + } + switch(algo) + { + case CMPH_CHM: + mph->data = chm_config_new(); + break; + case CMPH_BMZ: + mph->data = bmz_config_new(); + break; + default: + assert(0); + } + } + mph->algo = algo; } void cmph_config_destroy(cmph_config_t *mph) @@ -115,7 +141,7 @@ void cmph_config_destroy(cmph_config_t *mph) chm_config_destroy(mph); break; case CMPH_BMZ: /* included -- Fabiano */ - bmz_config_destroy(mph); + bmz_config_destroy(mph); break; default: assert(0); @@ -159,13 +185,11 @@ cmph_t *cmph_new(cmph_config_t *mph) { case CMPH_CHM: DEBUGP("Creating chm hash\n"); - mph->data = chm_config_new(mph->key_source); if (c == 0) c = 2.09; mphf = chm_new(mph, c); break; case CMPH_BMZ: /* included -- Fabiano */ DEBUGP("Creating bmz hash\n"); - mph->data = bmz_config_new(mph->key_source); if (c == 0) c = 1.15; mphf = bmz_new(mph, c); break; @@ -205,8 +229,8 @@ cmph_t *cmph_load(FILE *f) chm_load(f, mphf); break; case CMPH_BMZ: /* 
included -- Fabiano */ - DEBUGP("Loading bmz algorithm dependent parts\n"); - bmz_load(f, mphf); + DEBUGP("Loading bmz algorithm dependent parts\n"); + bmz_load(f, mphf); break; default: assert(0); diff --git a/src/cmph_structs.c b/src/cmph_structs.c index d6c4306..553db3d 100644 --- a/src/cmph_structs.c +++ b/src/cmph_structs.c @@ -12,6 +12,7 @@ cmph_config_t *__config_new(cmph_io_adapter_t *key_source) if (mph == NULL) return NULL; mph->key_source = key_source; mph->verbosity = 0; + mph->data = NULL; float c = 0; return mph; } @@ -23,9 +24,8 @@ void __config_destroy(cmph_config_t *mph) void __cmph_dump(cmph_t *mphf, FILE *fd) { - cmph_uint32 nsize = htonl(mphf->size); fwrite(cmph_names[mphf->algo], (cmph_uint32)(strlen(cmph_names[mphf->algo]) + 1), 1, fd); - fwrite(&nsize, sizeof(mphf->size), 1, fd); + fwrite(&(mphf->size), sizeof(mphf->size), 1, fd); } cmph_t *__cmph_load(FILE *f) { @@ -58,7 +58,6 @@ cmph_t *__cmph_load(FILE *f) mphf = (cmph_t *)malloc(sizeof(cmph_t)); mphf->algo = algo; fread(&(mphf->size), sizeof(mphf->size), 1, f); - mphf->size = ntohl(mphf->size); mphf->data = NULL; DEBUGP("Algorithm is %s and mphf is sized %u\n", cmph_names[algo], mphf->size); diff --git a/src/debug.h b/src/debug.h index daab6c7..0f7ddb1 100644 --- a/src/debug.h +++ b/src/debug.h @@ -14,7 +14,7 @@ #endif #endif -#ifdef WIN32 +#ifndef __GNUC__ #ifndef __DEBUG_H__ #define __DEBUG_H__ #include @@ -39,13 +39,13 @@ static void dummyprintf(const char *format, ...) #endif #ifdef DEBUG -#ifdef WIN32 +#ifndef __GNUC__ #define DEBUGP debugprintf #else #define DEBUGP(args...) do { fprintf(stderr, "%s:%d ", __FILE__, __LINE__); fprintf(stderr, ## args); } while(0) #endif #else -#ifdef WIN32 +#ifndef __GNUC__ #define DEBUGP dummyprintf #else #define DEBUGP(args...) 
diff --git a/src/main.c b/src/main.c index 537d28e..c010123 100644 --- a/src/main.c +++ b/src/main.c @@ -1,4 +1,8 @@ +#ifdef WIN32 #include "../wingetopt.h" +#else +#include +#endif #include #include #include @@ -218,7 +222,7 @@ int main(int argc, char **argv) } else { - cmph_uint8 * hashtable = NULL; + cmph_uint8 * hashtable = NULL; mphf_fd = fopen(mphf_file, "r"); if (mphf_fd == NULL) {