Initial revision

This commit is contained in:
davi
2004-12-23 13:16:30 +00:00
commit b3f008eb40
42 changed files with 3448 additions and 0 deletions

22
src/Makefile.am Normal file
View File

@@ -0,0 +1,22 @@
# Build the cmph command-line tool and the libcmph libtool library.
bin_PROGRAMS = cmph
lib_LTLIBRARIES = libcmph.la
# Headers installed for library users.
include_HEADERS = cmph.h cmph_types.h
# All sources compiled into libcmph (hash functions, graph helpers and the
# czech/bmz algorithms).
libcmph_la_SOURCES = debug.h\
cmph_types.h\
hash.h hash_state.h hash.c\
jenkins_hash.h jenkins_hash.c\
djb2_hash.h djb2_hash.c\
sdbm_hash.h sdbm_hash.c\
fnv_hash.h fnv_hash.c\
vstack.h vstack.c\
vqueue.h vqueue.c\
graph.h graph.c\
cmph.h cmph.c\
cmph_structs.h cmph_structs.c\
czech.h czech_structs.h czech.c\
bmz.h bmz_structs.h bmz.c
# libtool interface version, in current:revision:age form.
libcmph_la_LDFLAGS = -version-info 0:0:0
# The cmph program is main.c linked against the library built above.
cmph_SOURCES = main.c
cmph_LDADD = libcmph.la

439
src/bmz.c Normal file
View File

@@ -0,0 +1,439 @@
#include "bmz.h"
#include "cmph_structs.h"
#include "bmz_structs.h"
#include "hash.h"
#include "vqueue.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <netinet/in.h>
//#define DEBUG
#include "debug.h"
/* Sentinel stored in g[] for vertices that have not been assigned a value yet. */
static uint32 UNDEFINED = UINT_MAX;
/* Forward declarations of file-local helpers that are defined further down. */
static int bmz_gen_edges(mph_t *mph);
static void bmz_traverse_critical_nodes(bmz_mph_data_t *bmz, uint32 v, uint32 * biggest_g_value, uint32 * biggest_edge_value, uint8 * used_edges);
static void bmz_traverse_non_critical_nodes(bmz_mph_data_t *bmz, uint8 * used_edges);
/*
 * Create a generator object for the BMZ algorithm.
 *
 * Both hash slots default to HASH_JENKINS and the g/graph/hashes pointers
 * start out NULL. Returns NULL when either allocation fails.
 */
mph_t *bmz_mph_new(key_source_t *key_source)
{
	bmz_mph_data_t *state;
	mph_t *result = __mph_new(MPH_BMZ, key_source);

	if (!result) return NULL;

	state = (bmz_mph_data_t *)malloc(sizeof(bmz_mph_data_t));
	if (!state)
	{
		/* Undo the generic allocation so nothing leaks. */
		__mph_destroy(result);
		return NULL;
	}

	state->hashfuncs[0] = HASH_JENKINS;
	state->hashfuncs[1] = HASH_JENKINS;
	state->g = NULL;
	state->graph = NULL;
	state->hashes = NULL;

	result->data = state;
	assert(result->data);
	return result;
}
/*
 * Release a BMZ generator: first the algorithm-specific data block hanging
 * off mph->data, then the generic mph_t itself.
 */
void bmz_mph_destroy(mph_t *mph)
{
	bmz_mph_data_t *algo_data = (bmz_mph_data_t *)mph->data;

	DEBUGP("Destroying algorithm dependent data\n");
	free(algo_data);
	__mph_destroy(mph);
}
/*
 * Copy up to two hash function ids from a HASH_COUNT-terminated array into
 * the generator. Entries beyond the second are ignored because bmz only
 * uses two hash functions.
 */
void bmz_mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs)
{
	bmz_mph_data_t *bmz = (bmz_mph_data_t *)mph->data;
	uint32 slot;

	for (slot = 0; hashfuncs[slot] != HASH_COUNT && slot < 2; ++slot)
	{
		bmz->hashfuncs[slot] = hashfuncs[slot];
	}
}
/*
 * Run the BMZ construction (mapping, ordering, searching) over the keys of
 * mph->key_source and return the resulting function.
 *
 * bmz_c scales the vertex count: n = ceil(bmz_c * nkeys).
 *
 * Returns NULL when no simple graph could be generated in 10 attempts or
 * when an allocation fails. On every failure path the intermediate graph,
 * hash states and g array are released and the corresponding pointers in
 * the generator are reset to NULL. On success ownership of g and of the
 * hash states moves to the returned mphf_t.
 */
mphf_t *bmz_mph_create(mph_t *mph, float bmz_c)
{
	mphf_t *mphf = NULL;
	bmz_mphf_data_t *bmzf = NULL;
	bmz_mph_data_t *bmz = (bmz_mph_data_t *)mph->data;
	uint32 i;
	uint32 iterations = 10;
	uint8 *used_edges = NULL;
	uint32 biggest_g_value = 0;
	uint32 biggest_edge_value = 1;

	DEBUGP("bmz_c: %f\n", bmz_c);
	bmz->m = mph->key_source->nkeys;
	bmz->n = ceil(bmz_c * mph->key_source->nkeys);
	DEBUGP("m (edges): %u n (vertices): %u bmz_c: %f\n", bmz->m, bmz->n, bmz_c);
	bmz->graph = graph_new(bmz->n, bmz->m);
	if (bmz->graph == NULL) return NULL;
	DEBUGP("Created graph\n");

	bmz->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3);
	if (bmz->hashes == NULL) goto fail;
	for(i = 0; i < 3; ++i) bmz->hashes[i] = NULL;

	// Mapping step
	if (mph->verbosity)
	{
		fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", bmz->m, bmz->n);
	}
	while(1)
	{
		int ok;
		DEBUGP("hash function 1\n");
		bmz->hashes[0] = hash_state_new(bmz->hashfuncs[0], bmz->n);
		DEBUGP("hash function 2\n");
		bmz->hashes[1] = hash_state_new(bmz->hashfuncs[1], bmz->n);
		DEBUGP("Generating edges\n");
		ok = bmz_gen_edges(mph);
		if (ok) break;

		/* Throw this pair of hash states away and retry with fresh ones. */
		--iterations;
		hash_state_destroy(bmz->hashes[0]);
		bmz->hashes[0] = NULL;
		hash_state_destroy(bmz->hashes[1]);
		bmz->hashes[1] = NULL;
		DEBUGP("%u iterations remaining\n", iterations);
		if (mph->verbosity)
		{
			fprintf(stderr, "simple graph creation failure - %u iterations remaining\n", iterations);
		}
		if (iterations == 0) break;
	}
	if (iterations == 0) goto fail;

	// Ordering step
	if (mph->verbosity)
	{
		fprintf(stderr, "Starting ordering step\n");
	}
	graph_obtain_critical_nodes(bmz->graph);

	// Searching step
	if (mph->verbosity)
	{
		fprintf(stderr, "Starting Searching step\n");
		fprintf(stderr, "\tTraversing critical vertices.\n");
	}
	DEBUGP("Searching step\n");

	/* Acquire everything that can fail before labelling starts. */
	used_edges = (uint8 *)malloc(bmz->m*sizeof(uint8));
	free(bmz->g);
	bmz->g = (uint32 *)malloc(bmz->n * sizeof(uint32));
	mphf = (mphf_t *)malloc(sizeof(mphf_t));
	bmzf = (bmz_mphf_data_t *)malloc(sizeof(bmz_mphf_data_t));
	if (used_edges == NULL || bmz->g == NULL || mphf == NULL || bmzf == NULL) goto fail;

	memset(used_edges, 0, bmz->m);
	for (i = 0; i < bmz->n; ++i) bmz->g[i] = UNDEFINED;
	for (i = 0; i < bmz->n; ++i) // critical nodes
	{
		if (graph_node_is_critical(bmz->graph, i) && (bmz->g[i] == UNDEFINED))
		{
			bmz_traverse_critical_nodes(bmz, i, &biggest_g_value, &biggest_edge_value, used_edges);
		}
	}
	if (mph->verbosity)
	{
		fprintf(stderr, "\tTraversing non critical vertices.\n");
	}
	bmz_traverse_non_critical_nodes(bmz, used_edges); // non_critical_nodes
	graph_destroy(bmz->graph);
	bmz->graph = NULL;
	free(used_edges);

	mphf->algo = mph->algo;
	mphf->key_source = NULL;
	bmzf->g = bmz->g;
	bmz->g = NULL; //transfer memory ownership
	bmzf->hashes = bmz->hashes;
	bmz->hashes = NULL; //transfer memory ownership
	bmzf->n = bmz->n;
	bmzf->m = bmz->m;
	mphf->data = bmzf;
	mphf->size = bmz->m;
	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
	return mphf;

fail:
	/* Release every partially built resource and leave no dangling pointers. */
	free(used_edges);
	free(bmzf);
	free(mphf);
	free(bmz->g);
	bmz->g = NULL;
	if (bmz->hashes != NULL)
	{
		for (i = 0; i < 2; ++i)
		{
			if (bmz->hashes[i] != NULL) hash_state_destroy(bmz->hashes[i]);
		}
		free(bmz->hashes);
		bmz->hashes = NULL;
	}
	graph_destroy(bmz->graph);
	bmz->graph = NULL;
	return NULL;
}
/*
 * Assign g values to the still-unlabelled critical vertices reachable from v,
 * using a vqueue as the worklist.
 *
 * bmz                 generator state; reads bmz->graph and bmz->m, writes bmz->g
 * v                   starting vertex (the caller passes a critical vertex whose g is UNDEFINED)
 * biggest_g_value     in/out: largest g value handed out so far
 * biggest_edge_value  in/out: largest g[u] + g[lav] sum marked in used_edges so far
 * used_edges          per-edge-value flags; an entry is set to 1 once that sum is taken
 */
static void bmz_traverse_critical_nodes(bmz_mph_data_t *bmz, uint32 v, uint32 * biggest_g_value, uint32 * biggest_edge_value, uint8 * used_edges)
{
uint32 next_g;
uint32 u; /* Auxiliary vertex */
uint32 lav; /* lookahead vertex */
uint8 collision;
vqueue_t * q = vqueue_new(graph_ncritical_nodes(bmz->graph));
graph_iterator_t it, it1;
DEBUGP("Labelling critical vertices\n");
/* Seed the starting vertex from the current biggest edge value. */
bmz->g[v] = (uint32)ceil ((double)(*biggest_edge_value)/2) - 1;
next_g = (uint32)floor((double)(*biggest_edge_value/2)); /* next_g is recomputed from *biggest_g_value inside the lookahead loop below */
*biggest_g_value = next_g;
vqueue_insert(q, v);
while(!vqueue_is_empty(q))
{
v = vqueue_remove(q);
it = graph_neighbors_it(bmz->graph, v);
while ((u = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR)
{
/* Only critical neighbours that have no label yet are handled here. */
if (graph_node_is_critical(bmz->graph, u) && (bmz->g[u] == UNDEFINED))
{
collision = 1;
while(collision) // lookahead to resolve collisions
{
/* Candidate label: one above the largest label tried so far. */
next_g = *biggest_g_value + 1;
it1 = graph_neighbors_it(bmz->graph, u);
collision = 0;
while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz->graph, lav) && (bmz->g[lav] != UNDEFINED))
{
assert(next_g + bmz->g[lav] < bmz->m);
/* The sum with an already labelled neighbour is taken: reject this candidate. */
if (used_edges[next_g + bmz->g[lav]])
{
collision = 1;
break;
}
}
}
/* Record the candidate even when rejected, so that the next round tries a larger one. */
if (next_g > *biggest_g_value) *biggest_g_value = next_g;
}
// Marking used edges...
it1 = graph_neighbors_it(bmz->graph, u);
while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR)
{
if (graph_node_is_critical(bmz->graph, lav) && (bmz->g[lav] != UNDEFINED))
{
used_edges[next_g + bmz->g[lav]] = 1;
if(next_g + bmz->g[lav] > *biggest_edge_value) *biggest_edge_value = next_g + bmz->g[lav];
}
}
bmz->g[u] = next_g; // Labelling vertex u.
vqueue_insert(q, u);
}
}
}
vqueue_destroy(q);
}
/*
 * Starting at unused_edge_index, return the first index whose used_edges
 * flag is clear. Asserts that the scan never reaches bmz->m.
 */
static uint32 next_unused_edge(bmz_mph_data_t *bmz, uint8 * used_edges, uint32 unused_edge_index)
{
	uint32 candidate = unused_edge_index;

	for (;;)
	{
		assert(candidate < bmz->m);
		if (!used_edges[candidate]) return candidate;
		++candidate;
	}
}
/*
 * Depth-first labelling from v: every neighbour whose g is still UNDEFINED
 * takes the next unused edge value (searched from *unused_edge_index + 1),
 * gets g[neighbour] = edge value - g[v], and is then explored recursively.
 */
static void bmz_traverse(bmz_mph_data_t *bmz, uint8 * used_edges, uint32 v, uint32 * unused_edge_index)
{
	graph_iterator_t cursor = graph_neighbors_it(bmz->graph, v);
	uint32 w;

	for (w = graph_next_neighbor(bmz->graph, &cursor);
	     w != GRAPH_NO_NEIGHBOR;
	     w = graph_next_neighbor(bmz->graph, &cursor))
	{
		DEBUGP("Visiting neighbor %u\n", w);
		if (bmz->g[w] == UNDEFINED)
		{
			*unused_edge_index = next_unused_edge(bmz, used_edges, *unused_edge_index + 1);
			bmz->g[w] = *unused_edge_index - bmz->g[v];
			bmz_traverse(bmz, used_edges, w, unused_edge_index);
		}
	}
}
/*
 * Label every vertex that the critical pass left UNDEFINED.
 *
 * First pass: for each edge with exactly one labelled endpoint, traverse
 * from that labelled endpoint. Second pass: any vertex still UNDEFINED is
 * given g = 0 and becomes the root of a new traversal.
 */
static void bmz_traverse_non_critical_nodes(bmz_mph_data_t *bmz, uint8 * used_edges)
{
	/* Wraps to the maximum uint32 so the first "+ 1" in bmz_traverse yields 0. */
	uint32 next_free = (uint32)-1;
	uint32 e, node;

	DEBUGP("Labelling non critical vertices\n");
	for (e = 0; e < bmz->m; e++)
	{
		uint32 a = graph_vertex_id(bmz->graph, e, 0);
		uint32 b = graph_vertex_id(bmz->graph, e, 1);
		int a_labelled = (bmz->g[a] != UNDEFINED);
		int b_labelled = (bmz->g[b] != UNDEFINED);

		/* Nothing to do when both or neither endpoint is labelled. */
		if (a_labelled == b_labelled) continue;
		bmz_traverse(bmz, used_edges, a_labelled ? a : b, &next_free);
	}
	for (node = 0; node < bmz->n; node++)
	{
		if (bmz->g[node] != UNDEFINED) continue;
		bmz->g[node] = 0;
		bmz_traverse(bmz, used_edges, node, &next_free);
	}
}
/*
 * Mapping step: hash every key with both hash functions and add the edge
 * (h1, h2) to the graph.
 *
 * Returns 1 when every key produced a distinct, non-loop edge; returns 0 as
 * soon as a self loop or a repeated edge is found so the caller can retry
 * with new hash functions.
 */
static int bmz_gen_edges(mph_t *mph)
{
	uint32 e;
	bmz_mph_data_t *bmz = (bmz_mph_data_t *)mph->data;
	uint8 multiple_edges = 0;
	DEBUGP("Generating edges for %u vertices\n", bmz->n);
	graph_clear_edges(bmz->graph);
	mph->key_source->rewind(mph->key_source->data);
	for (e = 0; e < mph->key_source->nkeys; ++e)
	{
		uint32 h1, h2;
		uint32 keylen;
		char *key;
		mph->key_source->read(mph->key_source->data, &key, &keylen);
		h1 = hash(bmz->hashes[0], key, keylen) % bmz->n;
		h2 = hash(bmz->hashes[1], key, keylen) % bmz->n;
		/* Nudge h2 to the next vertex (wrapping) when both hashes agree. */
		if (h1 == h2) if (++h2 >= bmz->n) h2 = 0;
		if (h1 == h2)
		{
			/* e is an unsigned key index, so it must be printed with %u. */
			if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e);
			mph->key_source->dispose(mph->key_source->data, key, keylen);
			return 0;
		}
		DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key);
		mph->key_source->dispose(mph->key_source->data, key, keylen);
		multiple_edges = graph_contains_edge(bmz->graph, h1, h2);
		if (mph->verbosity && multiple_edges) fprintf(stderr, "A non simple graph was generated\n");
		if (multiple_edges) return 0; // checking multiple edge restriction.
		graph_add_edge(bmz->graph, h1, h2);
	}
	return !multiple_edges;
}
/*
 * Serialise a BMZ function to fd. Layout, every integer in network byte
 * order: generic header written by __mphf_dump, the hash function count (2),
 * then for each hash state its byte length followed by the bytes, then n,
 * m and the n entries of g. Always returns 1.
 */
int bmz_mphf_dump(mphf_t *mphf, FILE *fd)
{
	bmz_mphf_data_t *data = (bmz_mphf_data_t *)mphf->data;
	uint32 hash_count_be = htonl(2); //number of hash functions
	uint32 word_be;
	uint32 h, v;

	__mphf_dump(mphf, fd);
	fwrite(&hash_count_be, sizeof(uint32), 1, fd);

	for (h = 0; h < 2; ++h)
	{
		char *state_buf = NULL;
		uint32 state_len;

		hash_state_dump(data->hashes[h], &state_buf, &state_len);
		DEBUGP("Dumping hash state with %u bytes to disk\n", state_len);
		word_be = htonl(state_len);
		fwrite(&word_be, sizeof(uint32), 1, fd);
		fwrite(state_buf, state_len, 1, fd);
		free(state_buf);
	}

	word_be = htonl(data->n);
	fwrite(&word_be, sizeof(uint32), 1, fd);
	word_be = htonl(data->m);
	fwrite(&word_be, sizeof(uint32), 1, fd);

	for (v = 0; v < data->n; ++v)
	{
		word_be = htonl(data->g[v]);
		fwrite(&word_be, sizeof(uint32), 1, fd);
	}
#ifdef DEBUG
	fprintf(stderr, "G: ");
	for (v = 0; v < data->n; ++v) fprintf(stderr, "%u ", data->g[v]);
	fprintf(stderr, "\n");
#endif
	return 1;
}
/*
 * Read the BMZ-specific part of a dumped function from f and attach it to
 * mphf->data. The layout mirrors bmz_mphf_dump: hash count, each hash state
 * (length + bytes), n, m, then the n entries of g, all in network byte order.
 *
 * The hashes array gets one extra slot that is set to NULL as a terminator.
 */
void bmz_mphf_load(FILE *f, mphf_t *mphf)
{
	uint32 nhashes;
	char *buf = NULL;
	uint32 buflen;
	uint32 i;
	bmz_mphf_data_t *bmz = (bmz_mphf_data_t *)malloc(sizeof(bmz_mphf_data_t));
	DEBUGP("Loading bmz mphf\n");
	mphf->data = bmz;
	fread(&nhashes, sizeof(uint32), 1, f);
	nhashes = ntohl(nhashes);
	bmz->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1));
	bmz->hashes[nhashes] = NULL;
	DEBUGP("Reading %u hashes\n", nhashes);
	for (i = 0; i < nhashes; ++i)
	{
		fread(&buflen, sizeof(uint32), 1, f);
		buflen = ntohl(buflen);
		DEBUGP("Hash state has %u bytes\n", buflen);
		/* Temporary copy of the serialised state; released once it is decoded. */
		buf = (char *)malloc(buflen);
		fread(buf, buflen, 1, f);
		bmz->hashes[i] = hash_state_load(buf, buflen);
		free(buf);
		buf = NULL;
	}
	DEBUGP("Reading m and n\n");
	fread(&(bmz->n), sizeof(uint32), 1, f);
	bmz->n = ntohl(bmz->n);
	fread(&(bmz->m), sizeof(uint32), 1, f);
	bmz->m = ntohl(bmz->m);
	bmz->g = (uint32 *)malloc(sizeof(uint32)*bmz->n);
	fread(bmz->g, bmz->n*sizeof(uint32), 1, f);
	/* g was read as raw network-order words; convert in place. */
	for (i = 0; i < bmz->n; ++i) bmz->g[i] = ntohl(bmz->g[i]);
#ifdef DEBUG
	fprintf(stderr, "G: ");
	for (i = 0; i < bmz->n; ++i) fprintf(stderr, "%u ", bmz->g[i]);
	fprintf(stderr, "\n");
#endif
	return;
}
/*
 * Evaluate the BMZ function for key: g[h1] + g[h2], where h1 and h2 are the
 * two hash values reduced modulo n.
 *
 * When both hashes land on the same vertex, h2 moves to the next vertex and
 * wraps to 0 at n, exactly as bmz_gen_edges does while building the graph.
 * The wrap test has to be ">=": with ">" the value h2 == n would be used to
 * index g one element past its end.
 */
uint32 bmz_mphf_search(mphf_t *mphf, const char *key, uint32 keylen)
{
	bmz_mphf_data_t *bmz = mphf->data;
	uint32 h1 = hash(bmz->hashes[0], key, keylen) % bmz->n;
	uint32 h2 = hash(bmz->hashes[1], key, keylen) % bmz->n;
	DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
	if (h1 == h2 && ++h2 >= bmz->n) h2 = 0;
	DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, bmz->g[h1], bmz->g[h2], bmz->m);
	return bmz->g[h1] + bmz->g[h2];
}
/*
 * Free a BMZ function: the g array, both hash states, the hashes array,
 * the algorithm data block and finally the mphf_t itself.
 */
void bmz_mphf_destroy(mphf_t *mphf)
{
	bmz_mphf_data_t *fn = (bmz_mphf_data_t *)mphf->data;
	uint32 h;

	free(fn->g);
	for (h = 0; h < 2; ++h)
	{
		hash_state_destroy(fn->hashes[h]);
	}
	free(fn->hashes);
	free(fn);
	free(mphf);
}

18
src/bmz.h Normal file
View File

@@ -0,0 +1,18 @@
#ifndef __BMZ_H__
#define __BMZ_H__
#include "graph.h"
#include "cmph.h"
/* Query-side data of a generated BMZ function; the struct body lives in bmz_structs.h. */
typedef struct __bmz_mphf_data_t bmz_mphf_data_t;
/* Build-side working data of the BMZ generator; the struct body lives in bmz_structs.h. */
typedef struct __bmz_mph_data_t bmz_mph_data_t;
/* Create a BMZ generator over key_source; returns NULL if allocation fails. */
mph_t *bmz_mph_new(key_source_t *key_source);
/* Select up to two hash functions from a HASH_COUNT-terminated array. */
void bmz_mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs);
/* Free a generator created by bmz_mph_new. */
void bmz_mph_destroy(mph_t *mph);
/* Build the function using ceil(bmz_c * nkeys) vertices; returns NULL when graph generation keeps failing. */
mphf_t *bmz_mph_create(mph_t *mph, float bmz_c);
/* Read the BMZ-specific part of a dumped function from f into mphf->data. */
void bmz_mphf_load(FILE *f, mphf_t *mphf);
/* Write the function to f; the implementation in bmz.c always returns 1. */
int bmz_mphf_dump(mphf_t *mphf, FILE *f);
/* Evaluate the function for a key of keylen bytes. */
uint32 bmz_mphf_search(mphf_t *mphf, const char *key, uint32 keylen);
#endif

24
src/bmz_structs.h Normal file
View File

@@ -0,0 +1,24 @@
#ifndef __BMZ_STRUCTS_H__
#define __BMZ_STRUCTS_H__
#include "hash_state.h"
/* Data needed to evaluate a finished BMZ function (see bmz_mphf_search). */
struct __bmz_mphf_data_t
{
uint32 m; //edges (words) count
uint32 n; //vertex count
uint32 *g; // n per-vertex values; a lookup returns g[h1] + g[h2]
hash_state_t **hashes; // hash states; entries 0 and 1 are used for lookups
};
/* Working state of the BMZ generator while a function is being built. */
struct __bmz_mph_data_t
{
CMPH_HASH hashfuncs[2]; // ids of the two hash functions to instantiate
uint32 m; //edges (words) count
uint32 n; //vertex count
graph_t *graph; // graph with one edge per key; only alive during creation
uint32 *g; // values under construction; handed over to the mphf on success
hash_state_t **hashes; // hash states; handed over to the mphf on success
};
#endif

169
src/cmph.c Normal file
View File

@@ -0,0 +1,169 @@
#include "cmph.h"
#include "cmph_structs.h"
#include "czech.h"
#include "bmz.h"
//#include "bmz.h" /* included -- Fabiano */
#include <stdlib.h>
#include <assert.h>
//#define DEBUG
#include "debug.h"
/* Algorithm names indexed by MPH_ALGO; the name is what __mphf_dump writes and __mphf_load matches. */
const char *mph_names[] = { "czech", "bmz", NULL }; /* included -- Fabiano */
/*
 * Create a generator for the requested algorithm over key_source.
 * Asserts on an unknown algorithm and on a NULL result.
 */
mph_t *mph_new(MPH_ALGO algo, key_source_t *key_source)
{
	mph_t *created = NULL;

	DEBUGP("Creating mph with algorithm %s\n", mph_names[algo]);
	if (algo == MPH_CZECH)
	{
		created = czech_mph_new(key_source);
	}
	else if (algo == MPH_BMZ) /* included -- Fabiano */
	{
		DEBUGP("new bmz algorithm \n");
		created = bmz_mph_new(key_source);
	}
	else
	{
		assert(0);
	}
	assert(created);
	return created;
}
/* Free a generator by dispatching to the destroy routine of its algorithm. */
void mph_destroy(mph_t *mph)
{
	DEBUGP("Destroying mph with algo %s\n", mph_names[mph->algo]);
	if (mph->algo == MPH_CZECH)
	{
		czech_mph_destroy(mph);
	}
	else if (mph->algo == MPH_BMZ) /* included -- Fabiano */
	{
		bmz_mph_destroy(mph);
	}
	else
	{
		assert(0);
	}
}
/* Store the verbosity level; the algorithms print progress to stderr when it is non-zero. */
void mph_set_verbosity(mph_t *mph, uint32 verbosity)
{
	uint32 level = verbosity;

	mph->verbosity = level;
}
/*
 * Forward the hash function selection to the generator's algorithm.
 * Unknown algorithms are silently ignored.
 */
void mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs)
{
	if (mph->algo == MPH_CZECH)
	{
		czech_mph_set_hashfuncs(mph, hashfuncs);
	}
	else if (mph->algo == MPH_BMZ) /* included -- Fabiano */
	{
		bmz_mph_set_hashfuncs(mph, hashfuncs);
	}
}
/*
 * Build the function with the generator's algorithm, passing the fixed
 * vertices-per-key ratio used for it (2.09 for czech, 1.10 for bmz).
 */
mphf_t *mph_create(mph_t *mph)
{
	mphf_t *built = NULL;

	if (mph->algo == MPH_CZECH)
	{
		DEBUGP("Creating czech hash\n");
		built = czech_mph_create(mph, 2.09);
	}
	else if (mph->algo == MPH_BMZ) /* included -- Fabiano */
	{
		DEBUGP("Creating bmz hash\n");
		built = bmz_mph_create(mph, 1.10);
	}
	else
	{
		assert(0);
	}
	return built;
}
/*
 * Write mphf to f through the dump routine of its algorithm and return that
 * routine's result. Asserts (and returns 0) for an unknown algorithm.
 */
int mphf_dump(mphf_t *mphf, FILE *f)
{
	if (mphf->algo == MPH_CZECH) return czech_mphf_dump(mphf, f);
	if (mphf->algo == MPH_BMZ) return bmz_mphf_dump(mphf, f); /* included -- Fabiano */

	assert(0);
	return 0;
}
/*
 * Load a dumped function from f: the generic header first, then the part
 * specific to the algorithm named in that header. Returns NULL when the
 * generic header cannot be loaded.
 */
mphf_t *mphf_load(FILE *f)
{
	mphf_t *loaded;

	DEBUGP("Loading mphf generic parts\n");
	loaded = __mphf_load(f);
	if (!loaded) return NULL;

	DEBUGP("Loading mphf algorithm dependent parts\n");
	if (loaded->algo == MPH_CZECH)
	{
		czech_mphf_load(f, loaded);
	}
	else if (loaded->algo == MPH_BMZ) /* included -- Fabiano */
	{
		DEBUGP("Loading bmz algorithm dependent parts\n");
		bmz_mphf_load(f, loaded);
	}
	else
	{
		assert(0);
	}
	DEBUGP("Loaded mphf\n");
	return loaded;
}
/*
 * Evaluate mphf for key (keylen bytes) with the search routine of its
 * algorithm.
 *
 * An unknown algorithm trips the assertion; when assertions are compiled
 * out the function returns 0 instead of falling off a value-less return.
 */
uint32 mphf_search(mphf_t *mphf, const char *key, uint32 keylen)
{
	DEBUGP("mphf algorithm: %u \n", mphf->algo);
	switch(mphf->algo)
	{
		case MPH_CZECH:
			return czech_mphf_search(mphf, key, keylen);
		case MPH_BMZ: /* included -- Fabiano */
			DEBUGP("bmz algorithm search\n");
			return bmz_mphf_search(mphf, key, keylen);
		default:
			assert(0);
	}
	assert(0);
	return 0;
}
/* Return the size recorded in the function (the creators set it to the number of keys). */
uint32 mphf_size(mphf_t *mphf)
{
	uint32 size = mphf->size;

	return size;
}
/* Free a function through the destroy routine of its algorithm; asserts on an unknown one. */
void mphf_destroy(mphf_t *mphf)
{
	if (mphf->algo == MPH_CZECH)
	{
		czech_mphf_destroy(mphf);
		return;
	}
	if (mphf->algo == MPH_BMZ) /* included -- Fabiano */
	{
		bmz_mphf_destroy(mphf);
		return;
	}
	assert(0);
}

44
src/cmph.h Normal file
View File

@@ -0,0 +1,44 @@
#ifndef __CMPH_H__
#define __CMPH_H__
#include <stdlib.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C"
{
#endif
#include "cmph_types.h"
/* Opaque generator handle (build side). */
typedef struct __mph_t mph_t;
/* Opaque generated-function handle (query side). */
typedef struct __mphf_t mphf_t;
/*
 * Caller-supplied key iterator. The generators call rewind(data) once per
 * pass, then read(data, &key, &keylen) followed by dispose(data, key, keylen)
 * for each of the nkeys keys. The algorithms shown in bmz.c and czech.c do
 * not look at read's return value.
 */
typedef struct
{
void *data;
uint32 nkeys;
int (*read)(void *, char **, uint32 *);
void (*dispose)(void *, char *, uint32);
void (*rewind)(void *);
} key_source_t;
/** Hash generation API **/
/* Create a generator for algo over key_source. */
mph_t *mph_new(MPH_ALGO algo, key_source_t *key_source);
/* Choose hash functions; hashfuncs is terminated by HASH_COUNT. */
void mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs);
/* Non-zero verbosity makes the algorithms report progress on stderr. */
void mph_set_verbosity(mph_t *mph, uint32 verbosity);
/* Free a generator. */
void mph_destroy(mph_t *mph);
/* Build the function; may return NULL when generation fails. */
mphf_t *mph_create(mph_t *mph);
/** Hash querying API **/
/* Load a function previously written by mphf_dump; NULL if the header cannot be read. */
mphf_t *mphf_load(FILE *f);
/* Write the function to f. */
int mphf_dump(mphf_t *mphf, FILE *f);
/* Evaluate the function for a key of keylen bytes. */
uint32 mphf_search(mphf_t *mphf, const char *key, uint32 keylen);
/* Size recorded in the function. */
uint32 mphf_size(mphf_t *mphf);
/* Free a function. */
void mphf_destroy(mphf_t *mphf);
#ifdef __cplusplus
}
#endif
#endif

68
src/cmph_structs.c Normal file
View File

@@ -0,0 +1,68 @@
#include "cmph_structs.h"
#include <netinet/in.h>
#include <string.h>
#define DEBUG
#include "debug.h"
/*
 * Allocate the algorithm-independent part of a generator and record the
 * algorithm and key source. Verbosity starts at 0; the data pointer is left
 * for the algorithm to fill in. Returns NULL if the allocation fails.
 */
mph_t *__mph_new(MPH_ALGO algo, key_source_t *key_source)
{
	mph_t *base = (mph_t *)malloc(sizeof(mph_t));

	DEBUGP("Creating mph with algorithm %s\n", mph_names[algo]);
	if (!base) return NULL;

	base->verbosity = 0;
	base->key_source = key_source;
	base->algo = algo;
	return base;
}
/* Free the generic generator struct; the algorithm data must already have been released. */
void __mph_destroy(mph_t *mph)
{
	void *block = mph;

	free(block);
}
/*
 * Write the generic header of a dumped function: the NUL-terminated
 * algorithm name followed by the size in network byte order.
 */
void __mphf_dump(mphf_t *mphf, FILE *fd)
{
	const char *algo_name = mph_names[mphf->algo];
	uint32 size_be = htonl(mphf->size);

	fwrite(algo_name, strlen(algo_name) + 1, 1, fd);
	fwrite(&size_be, sizeof(mphf->size), 1, fd);
}
mphf_t *__mphf_load(FILE *f)
{
mphf_t *mphf = NULL;
uint32 i;
char algo_name[BUFSIZ];
char *ptr = algo_name;
MPH_ALGO algo = MPH_COUNT;
DEBUGP("Loading mphf\n");
while(1)
{
uint32 c = fread(ptr, 1, 1, f);
if (c != 1) return NULL;
if (*ptr == 0) break;
++ptr;
}
for(i = 0; i < MPH_COUNT; ++i)
{
if (strcmp(algo_name, mph_names[i]) == 0)
{
algo = i;
}
}
if (algo == MPH_COUNT)
{
DEBUGP("Algorithm %s not found\n", algo_name);
return NULL;
}
mphf = (mphf_t *)malloc(sizeof(mphf_t));
mphf->algo = algo;
fread(&(mphf->size), sizeof(mphf->size), 1, f);
mphf->size = ntohl(mphf->size);
mphf->data = NULL;
DEBUGP("Algorithm is %s and mphf is sized %u\n", mph_names[algo], mphf->size);
return mphf;
}

32
src/cmph_structs.h Normal file
View File

@@ -0,0 +1,32 @@
#ifndef __CMPH_STRUCTS_H__
#define __CMPH_STRUCTS_H__
#include "cmph.h"
/** Hash generation algorithm data
*/
struct __mph_t
{
	MPH_ALGO algo;            // algorithm this generator runs
	key_source_t *key_source; // caller-supplied key iterator
	uint32 verbosity;         // non-zero enables progress output on stderr
	void *data; //algorithm dependent data
};
/** Hash querying algorithm data
*/
struct __mphf_t
{
	MPH_ALGO algo;            // algorithm that produced this function
	uint32 size;              // set to the number of keys by the creators
	key_source_t *key_source; // not referenced by the code shown in cmph.c, bmz.c or czech.c
	void *data; //algorithm dependent data
};
/* Allocate the generic generator part; returns NULL on allocation failure. */
mph_t *__mph_new(MPH_ALGO algo, key_source_t *key_source);
/* Free the generic generator part. Declared with a full prototype so that
 * calls are type-checked against the definition in cmph_structs.c. */
void __mph_destroy(mph_t *mph);
/* Write the generic header (algorithm name and size) of a function. */
void __mphf_dump(mphf_t *mphf, FILE *);
/* Read the generic header of a function; returns NULL on failure. */
mphf_t *__mphf_load(FILE *f);
#endif

13
src/cmph_types.h Normal file
View File

@@ -0,0 +1,13 @@
#ifndef __CMPH_TYPES_H__
#define __CMPH_TYPES_H__
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef enum { HASH_JENKINS, HASH_DJB2, HASH_SDBM, HASH_FNV, HASH_GLIB, HASH_PJW, HASH_COUNT } CMPH_HASH;
extern const char *hash_names[];
typedef enum { MPH_CZECH, MPH_BMZ, MPH_COUNT } MPH_ALGO; /* included -- Fabiano */
extern const char *mph_names[];
#endif

320
src/czech.c Normal file
View File

@@ -0,0 +1,320 @@
#include "czech.h"
#include "cmph_structs.h"
#include "czech_structs.h"
#include "hash.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <netinet/in.h>
//#define DEBUG
#include "debug.h"
/* Forward declarations of file-local helpers that are defined further down. */
static int czech_gen_edges(mph_t *mph);
static void czech_traverse(czech_mph_data_t *czech, char *visited, uint32 v);
/*
 * Create a generator object for the czech algorithm.
 *
 * Both hash slots default to HASH_JENKINS and the g/graph/hashes pointers
 * start out NULL. Returns NULL when either allocation fails.
 */
mph_t *czech_mph_new(key_source_t *key_source)
{
	czech_mph_data_t *state;
	mph_t *result = __mph_new(MPH_CZECH, key_source);

	if (!result) return NULL;

	state = (czech_mph_data_t *)malloc(sizeof(czech_mph_data_t));
	if (!state)
	{
		/* Undo the generic allocation so nothing leaks. */
		__mph_destroy(result);
		return NULL;
	}

	state->hashfuncs[0] = HASH_JENKINS;
	state->hashfuncs[1] = HASH_JENKINS;
	state->g = NULL;
	state->graph = NULL;
	state->hashes = NULL;

	result->data = state;
	assert(result->data);
	return result;
}
/*
 * Release a czech generator: first the algorithm-specific data block
 * hanging off mph->data, then the generic mph_t itself.
 */
void czech_mph_destroy(mph_t *mph)
{
	czech_mph_data_t *algo_data = (czech_mph_data_t *)mph->data;

	DEBUGP("Destroying algorithm dependent data\n");
	free(algo_data);
	__mph_destroy(mph);
}
/*
 * Copy up to two hash function ids from a HASH_COUNT-terminated array into
 * the generator. Entries beyond the second are ignored because czech only
 * uses two hash functions.
 */
void czech_mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs)
{
	czech_mph_data_t *czech = (czech_mph_data_t *)mph->data;
	uint32 slot;

	for (slot = 0; hashfuncs[slot] != HASH_COUNT && slot < 2; ++slot)
	{
		czech->hashfuncs[slot] = hashfuncs[slot];
	}
}
/*
 * Run the czech construction (mapping, then assignment) over the keys of
 * mph->key_source and return the resulting function.
 *
 * c scales the vertex count: n = ceil(c * nkeys).
 *
 * Returns NULL when no acyclic graph could be generated in 10 attempts or
 * when an allocation fails. On every failure path the intermediate graph,
 * hash states and g array are released and the corresponding pointers in
 * the generator are reset to NULL. On success ownership of g and of the
 * hash states moves to the returned mphf_t.
 */
mphf_t *czech_mph_create(mph_t *mph, float c)
{
	mphf_t *mphf = NULL;
	czech_mphf_data_t *czechf = NULL;
	uint32 i;
	uint32 iterations = 10;
	char *visited = NULL;
	czech_mph_data_t *czech = (czech_mph_data_t *)mph->data;

	czech->m = mph->key_source->nkeys;
	czech->n = ceil(c * mph->key_source->nkeys);
	DEBUGP("m (edges): %u n (vertices): %u c: %f\n", czech->m, czech->n, c);
	czech->graph = graph_new(czech->n, czech->m);
	if (czech->graph == NULL) return NULL;
	DEBUGP("Created graph\n");

	czech->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3);
	if (czech->hashes == NULL) goto fail;
	for(i = 0; i < 3; ++i) czech->hashes[i] = NULL;

	//Mapping step
	if (mph->verbosity)
	{
		fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", czech->m, czech->n);
	}
	while(1)
	{
		int ok;
		czech->hashes[0] = hash_state_new(czech->hashfuncs[0], czech->n);
		czech->hashes[1] = hash_state_new(czech->hashfuncs[1], czech->n);
		ok = czech_gen_edges(mph);
		if (ok) break;

		/* Throw this pair of hash states away and retry with fresh ones. */
		--iterations;
		hash_state_destroy(czech->hashes[0]);
		czech->hashes[0] = NULL;
		hash_state_destroy(czech->hashes[1]);
		czech->hashes[1] = NULL;
		DEBUGP("%u iterations remaining\n", iterations);
		if (mph->verbosity)
		{
			fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations);
		}
		if (iterations == 0) break;
	}
	if (iterations == 0) goto fail;

	//Assignment step
	if (mph->verbosity)
	{
		fprintf(stderr, "Starting assignment step\n");
	}
	DEBUGP("Assignment step\n");

	/* Acquire everything that can fail before the traversal starts. */
	visited = (char *)malloc(czech->n);
	free(czech->g);
	czech->g = (uint32 *)malloc(czech->n * sizeof(uint32));
	mphf = (mphf_t *)malloc(sizeof(mphf_t));
	czechf = (czech_mphf_data_t *)malloc(sizeof(czech_mphf_data_t));
	if (visited == NULL || czech->g == NULL || mphf == NULL || czechf == NULL) goto fail;

	memset(visited, 0, czech->n);
	for (i = 0; i < czech->n; ++i)
	{
		if (!visited[i])
		{
			/* Each unvisited vertex roots a new component with g = 0. */
			czech->g[i] = 0;
			czech_traverse(czech, visited, i);
		}
	}
	graph_destroy(czech->graph);
	czech->graph = NULL;
	free(visited);

	mphf->algo = mph->algo;
	mphf->key_source = NULL;
	czechf->g = czech->g;
	czech->g = NULL; //transfer memory ownership
	czechf->hashes = czech->hashes;
	czech->hashes = NULL; //transfer memory ownership
	czechf->n = czech->n;
	czechf->m = czech->m;
	mphf->data = czechf;
	mphf->size = czech->m;
	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
	return mphf;

fail:
	/* Release every partially built resource and leave no dangling pointers. */
	free(visited);
	free(czechf);
	free(mphf);
	free(czech->g);
	czech->g = NULL;
	if (czech->hashes != NULL)
	{
		for (i = 0; i < 2; ++i)
		{
			if (czech->hashes[i] != NULL) hash_state_destroy(czech->hashes[i]);
		}
		free(czech->hashes);
		czech->hashes = NULL;
	}
	graph_destroy(czech->graph);
	czech->graph = NULL;
	return NULL;
}
/*
 * Depth-first assignment from v: mark v visited, then give every unvisited
 * neighbour g[neighbour] = id of the connecting edge - g[v] and recurse
 * into it.
 */
static void czech_traverse(czech_mph_data_t *czech, char *visited, uint32 v)
{
	graph_iterator_t cursor = graph_neighbors_it(czech->graph, v);
	uint32 w;

	visited[v] = 1;
	DEBUGP("Visiting vertex %u\n", v);
	for (;;)
	{
		w = graph_next_neighbor(czech->graph, &cursor);
		if (w == GRAPH_NO_NEIGHBOR) break;

		DEBUGP("Visiting neighbor %u\n", w);
		if (!visited[w])
		{
			DEBUGP("Visiting neighbor %u\n", w);
			DEBUGP("Visiting edge %u->%u with id %u\n", v, w, graph_edge_id(czech->graph, v, w));
			czech->g[w] = graph_edge_id(czech->graph, v, w) - czech->g[v];
			DEBUGP("g is %u (%u - %u mod %u)\n", czech->g[w], graph_edge_id(czech->graph, v, w), czech->g[v], czech->m);
			czech_traverse(czech, visited, w);
		}
	}
}
/*
 * Mapping step: hash every key with both hash functions and add the edge
 * (h1, h2) to the graph.
 *
 * Returns 1 when the resulting graph is acyclic. Returns 0 when a key maps
 * to a self loop or when the finished graph contains a cycle, so the caller
 * can retry with new hash functions.
 */
static int czech_gen_edges(mph_t *mph)
{
	uint32 e;
	czech_mph_data_t *czech = (czech_mph_data_t *)mph->data;
	int cycles = 0;
	DEBUGP("Generating edges for %u vertices\n", czech->n);
	graph_clear_edges(czech->graph);
	mph->key_source->rewind(mph->key_source->data);
	for (e = 0; e < mph->key_source->nkeys; ++e)
	{
		uint32 h1, h2;
		uint32 keylen;
		char *key;
		mph->key_source->read(mph->key_source->data, &key, &keylen);
		h1 = hash(czech->hashes[0], key, keylen) % czech->n;
		h2 = hash(czech->hashes[1], key, keylen) % czech->n;
		/* Nudge h2 to the next vertex (wrapping) when both hashes agree. */
		if (h1 == h2) if (++h2 >= czech->n) h2 = 0;
		if (h1 == h2)
		{
			/* e is an unsigned key index, so it must be printed with %u. */
			if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e);
			mph->key_source->dispose(mph->key_source->data, key, keylen);
			return 0;
		}
		DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key);
		mph->key_source->dispose(mph->key_source->data, key, keylen);
		graph_add_edge(czech->graph, h1, h2);
	}
	cycles = graph_is_cyclic(czech->graph);
	if (mph->verbosity && cycles) fprintf(stderr, "Cyclic graph generated\n");
	DEBUGP("Looking for cycles: %u\n", cycles);
	return ! cycles;
}
/*
 * Serialise a czech function to fd. Layout, every integer in network byte
 * order: generic header written by __mphf_dump, the hash function count (2),
 * then for each hash state its byte length followed by the bytes, then n,
 * m and the n entries of g. Always returns 1.
 */
int czech_mphf_dump(mphf_t *mphf, FILE *fd)
{
	czech_mphf_data_t *data = (czech_mphf_data_t *)mphf->data;
	uint32 hash_count_be = htonl(2); //number of hash functions
	uint32 word_be;
	uint32 h, v;

	__mphf_dump(mphf, fd);
	fwrite(&hash_count_be, sizeof(uint32), 1, fd);

	for (h = 0; h < 2; ++h)
	{
		char *state_buf = NULL;
		uint32 state_len;

		hash_state_dump(data->hashes[h], &state_buf, &state_len);
		DEBUGP("Dumping hash state with %u bytes to disk\n", state_len);
		word_be = htonl(state_len);
		fwrite(&word_be, sizeof(uint32), 1, fd);
		fwrite(state_buf, state_len, 1, fd);
		free(state_buf);
	}

	word_be = htonl(data->n);
	fwrite(&word_be, sizeof(uint32), 1, fd);
	word_be = htonl(data->m);
	fwrite(&word_be, sizeof(uint32), 1, fd);

	for (v = 0; v < data->n; ++v)
	{
		word_be = htonl(data->g[v]);
		fwrite(&word_be, sizeof(uint32), 1, fd);
	}
#ifdef DEBUG
	fprintf(stderr, "G: ");
	for (v = 0; v < data->n; ++v) fprintf(stderr, "%u ", data->g[v]);
	fprintf(stderr, "\n");
#endif
	return 1;
}
/*
 * Read the czech-specific part of a dumped function from f and attach it to
 * mphf->data. The layout mirrors czech_mphf_dump: hash count, each hash
 * state (length + bytes), n, m, then the n entries of g, all in network
 * byte order.
 *
 * The hashes array gets one extra slot that is set to NULL as a terminator.
 */
void czech_mphf_load(FILE *f, mphf_t *mphf)
{
	uint32 nhashes;
	char *buf = NULL;
	uint32 buflen;
	uint32 i;
	czech_mphf_data_t *czech = (czech_mphf_data_t *)malloc(sizeof(czech_mphf_data_t));
	DEBUGP("Loading czech mphf\n");
	mphf->data = czech;
	fread(&nhashes, sizeof(uint32), 1, f);
	nhashes = ntohl(nhashes);
	czech->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1));
	czech->hashes[nhashes] = NULL;
	DEBUGP("Reading %u hashes\n", nhashes);
	for (i = 0; i < nhashes; ++i)
	{
		fread(&buflen, sizeof(uint32), 1, f);
		buflen = ntohl(buflen);
		DEBUGP("Hash state has %u bytes\n", buflen);
		/* Temporary copy of the serialised state; released once it is decoded. */
		buf = (char *)malloc(buflen);
		fread(buf, buflen, 1, f);
		czech->hashes[i] = hash_state_load(buf, buflen);
		free(buf);
		buf = NULL;
	}
	DEBUGP("Reading m and n\n");
	fread(&(czech->n), sizeof(uint32), 1, f);
	czech->n = ntohl(czech->n);
	fread(&(czech->m), sizeof(uint32), 1, f);
	czech->m = ntohl(czech->m);
	czech->g = (uint32 *)malloc(sizeof(uint32)*czech->n);
	fread(czech->g, czech->n*sizeof(uint32), 1, f);
	/* g was read as raw network-order words; convert in place. */
	for (i = 0; i < czech->n; ++i) czech->g[i] = ntohl(czech->g[i]);
#ifdef DEBUG
	fprintf(stderr, "G: ");
	for (i = 0; i < czech->n; ++i) fprintf(stderr, "%u ", czech->g[i]);
	fprintf(stderr, "\n");
#endif
	return;
}
/*
 * Evaluate the czech function for key: (g[h1] + g[h2]) mod m, where h1 and
 * h2 are the two hash values reduced modulo n.
 *
 * When both hashes land on the same vertex, h2 moves to the next vertex and
 * wraps to 0 at n, exactly as czech_gen_edges does while building the
 * graph. The wrap test has to be ">=": with ">" the value h2 == n would be
 * used to index g one element past its end.
 */
uint32 czech_mphf_search(mphf_t *mphf, const char *key, uint32 keylen)
{
	czech_mphf_data_t *czech = mphf->data;
	uint32 h1 = hash(czech->hashes[0], key, keylen) % czech->n;
	uint32 h2 = hash(czech->hashes[1], key, keylen) % czech->n;
	DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
	if (h1 == h2 && ++h2 >= czech->n) h2 = 0;
	DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, czech->g[h1], czech->g[h2], czech->m);
	return (czech->g[h1] + czech->g[h2]) % czech->m;
}
/*
 * Free a czech function: the g array, both hash states, the hashes array,
 * the algorithm data block and finally the mphf_t itself.
 */
void czech_mphf_destroy(mphf_t *mphf)
{
	czech_mphf_data_t *fn = (czech_mphf_data_t *)mphf->data;
	uint32 h;

	free(fn->g);
	for (h = 0; h < 2; ++h)
	{
		hash_state_destroy(fn->hashes[h]);
	}
	free(fn->hashes);
	free(fn);
	free(mphf);
}

18
src/czech.h Normal file
View File

@@ -0,0 +1,18 @@
#ifndef __CZECH_H__
#define __CZECH_H__
#include "graph.h"
#include "cmph.h"
/* Query-side data of a generated czech function; the struct body lives in czech_structs.h. */
typedef struct __czech_mphf_data_t czech_mphf_data_t;
/* Build-side working data of the czech generator; the struct body lives in czech_structs.h. */
typedef struct __czech_mph_data_t czech_mph_data_t;
/* Create a czech generator over key_source; returns NULL if allocation fails. */
mph_t *czech_mph_new(key_source_t *key_source);
/* Select up to two hash functions from a HASH_COUNT-terminated array. */
void czech_mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs);
/* Free a generator created by czech_mph_new. */
void czech_mph_destroy(mph_t *mph);
/* Build the function using ceil(c * nkeys) vertices; returns NULL when graph generation keeps failing. */
mphf_t *czech_mph_create(mph_t *mph, float c);
/* Read the czech-specific part of a dumped function from f into mphf->data. */
void czech_mphf_load(FILE *f, mphf_t *mphf);
/* Write the function to f; the implementation in czech.c always returns 1. */
int czech_mphf_dump(mphf_t *mphf, FILE *f);
/* Evaluate the function for a key of keylen bytes. */
uint32 czech_mphf_search(mphf_t *mphf, const char *key, uint32 keylen);
#endif

24
src/czech_structs.h Normal file
View File

@@ -0,0 +1,24 @@
#ifndef __CZECH_STRUCTS_H__
#define __CZECH_STRUCTS_H__
#include "hash_state.h"
/* Data needed to evaluate a finished czech function (see czech_mphf_search). */
struct __czech_mphf_data_t
{
uint32 m; //edges (words) count
uint32 n; //vertex count
uint32 *g; // n per-vertex values; a lookup returns (g[h1] + g[h2]) % m
hash_state_t **hashes; // hash states; entries 0 and 1 are used for lookups
};
/* Working state of the czech generator while a function is being built. */
struct __czech_mph_data_t
{
CMPH_HASH hashfuncs[2]; // ids of the two hash functions to instantiate
uint32 m; //edges (words) count
uint32 n; //vertex count
graph_t *graph; // graph with one edge per key; only alive during creation
uint32 *g; // values under construction; handed over to the mphf on success
hash_state_t **hashes; // hash states; handed over to the mphf on success
};
#endif

15
src/debug.h Normal file
View File

@@ -0,0 +1,15 @@
#ifndef __MY_DEBUGC__
#define __MY_DEBUGC__
#ifdef __cplusplus
#include <cstdio>
#else
#include <stdio.h>
#endif
/*
 * DEBUGP(fmt, ...) prints "file:line " followed by the printf-style message
 * to stderr when DEBUG is defined before this header is included, and
 * expands to nothing otherwise.
 *
 * Written with the standard __VA_ARGS__ form instead of the GNU-only named
 * variadic parameter. At least the format string must always be supplied.
 */
#ifdef DEBUG
#define DEBUGP(...) do { fprintf(stderr, "%s:%d ", __FILE__, __LINE__); fprintf(stderr, __VA_ARGS__); } while(0)
#else
#define DEBUGP(...)
#endif
#endif

42
src/djb2_hash.c Normal file
View File

@@ -0,0 +1,42 @@
#include "djb2_hash.h"
#include <stdlib.h>
/*
 * Allocate a djb2 hash state tagged with HASH_DJB2.
 * Returns NULL when the allocation fails instead of writing through a
 * null pointer.
 */
djb2_state_t *djb2_state_new()
{
	djb2_state_t *state = (djb2_state_t *)malloc(sizeof(djb2_state_t));
	if (state == NULL) return NULL;
	state->hashfunc = HASH_DJB2;
	return state;
}
/* Free a state obtained from djb2_state_new or djb2_state_load. */
void djb2_state_destroy(djb2_state_t *state)
{
	void *block = state;

	free(block);
}
/*
 * Hash keylen bytes of k: start from 5381 and, for every byte, multiply the
 * running value by 33 and XOR the byte in. The state argument is not used.
 *
 * The key is walked through an unsigned char pointer (explicitly cast) and
 * bounded by an end pointer, which avoids comparing a signed counter with
 * the unsigned key length.
 */
uint32 djb2_hash(djb2_state_t *state, const char *k, uint32 keylen)
{
	const unsigned char *ptr = (const unsigned char *)k;
	const unsigned char *end = ptr + keylen;
	unsigned int hash = 5381;

	(void)state;
	while (ptr < end)
	{
		hash = (hash * 33) ^ *ptr;
		++ptr;
	}
	return hash;
}
/* djb2 has nothing to persist: report an empty dump (NULL buffer, length 0). */
void djb2_state_dump(djb2_state_t *state, char **buf, uint32 *buflen)
{
	*buflen = 0;
	*buf = NULL;
}
/*
 * Rebuild a djb2 state from a dump. The dump carries no data, so buf and
 * buflen are ignored and a fresh state tagged HASH_DJB2 is returned.
 * Returns NULL when the allocation fails.
 */
djb2_state_t *djb2_state_load(const char *buf, uint32 buflen)
{
	djb2_state_t *state = (djb2_state_t *)malloc(sizeof(djb2_state_t));
	(void)buf;
	(void)buflen;
	if (state == NULL) return NULL;
	state->hashfunc = HASH_DJB2;
	return state;
}

17
src/djb2_hash.h Normal file
View File

@@ -0,0 +1,17 @@
#ifndef __DJB2_HASH_H__
#define __DJB2_HASH_H__
#include "hash.h"
/* State of the djb2 hash; it only records which hash function it belongs to. */
typedef struct __djb2_state_t
{
CMPH_HASH hashfunc;
} djb2_state_t;
/* Allocate a new state tagged HASH_DJB2. */
djb2_state_t *djb2_state_new();
/* Hash keylen bytes of k. */
uint32 djb2_hash(djb2_state_t *state, const char *k, uint32 keylen);
/* Serialise the state; djb2 produces an empty dump (*buf = NULL, *buflen = 0). */
void djb2_state_dump(djb2_state_t *state, char **buf, uint32 *buflen);
/* Recreate a state from a dump. */
djb2_state_t *djb2_state_load(const char *buf, uint32 buflen);
/* Free a state. */
void djb2_state_destroy(djb2_state_t *state);
#endif

46
src/fnv_hash.c Normal file
View File

@@ -0,0 +1,46 @@
#include "fnv_hash.h"
#include <stdlib.h>
/*
 * Allocate an FNV hash state tagged with HASH_FNV.
 * Returns NULL when the allocation fails instead of writing through a
 * null pointer.
 */
fnv_state_t *fnv_state_new()
{
	fnv_state_t *state = (fnv_state_t *)malloc(sizeof(fnv_state_t));
	if (state == NULL) return NULL;
	state->hashfunc = HASH_FNV;
	return state;
}
/* Free a state obtained from fnv_state_new or fnv_state_load. */
void fnv_state_destroy(fnv_state_t *state)
{
	void *block = state;

	free(block);
}
/*
 * FNV-style hash of keylen bytes of k: for every byte the running value is
 * multiplied by the FNV prime 0x01000193 and the byte is XORed in. The
 * state argument is not used.
 *
 * The accumulator is an ordinary local starting at 0, so the result depends
 * only on the key. Keeping it in a static variable would carry the value
 * over from one call to the next and make the same key hash differently
 * depending on what was hashed before.
 */
uint32 fnv_hash(fnv_state_t *state, const char *k, uint32 keylen)
{
	const unsigned char *bp = (const unsigned char *)k;
	const unsigned char *be = bp + keylen;
	unsigned int hval = 0;

	(void)state;
	while (bp < be)
	{
		/* Shift-and-add form of hval *= 0x01000193. */
		hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24);
		hval ^= *bp++;
	}
	return hval;
}
/* FNV has nothing to persist: report an empty dump (NULL buffer, length 0). */
void fnv_state_dump(fnv_state_t *state, char **buf, uint32 *buflen)
{
	*buflen = 0;
	*buf = NULL;
}
/*
 * Rebuild an FNV state from a dump. The dump carries no data, so buf and
 * buflen are ignored and a fresh state tagged HASH_FNV is returned.
 * Returns NULL when the allocation fails.
 */
fnv_state_t *fnv_state_load(const char *buf, uint32 buflen)
{
	fnv_state_t *state = (fnv_state_t *)malloc(sizeof(fnv_state_t));
	(void)buf;
	(void)buflen;
	if (state == NULL) return NULL;
	state->hashfunc = HASH_FNV;
	return state;
}

17
src/fnv_hash.h Normal file
View File

@@ -0,0 +1,17 @@
#ifndef __FNV_HASH_H__
#define __FNV_HASH_H__
#include "hash.h"
/* State of the FNV hash; it only records which hash function it belongs to. */
typedef struct __fnv_state_t
{
CMPH_HASH hashfunc;
} fnv_state_t;
/* Allocate a new state tagged HASH_FNV. */
fnv_state_t *fnv_state_new();
/* Hash keylen bytes of k. */
uint32 fnv_hash(fnv_state_t *state, const char *k, uint32 keylen);
/* Serialise the state; FNV produces an empty dump (*buf = NULL, *buflen = 0). */
void fnv_state_dump(fnv_state_t *state, char **buf, uint32 *buflen);
/* Recreate a state from a dump. */
fnv_state_t *fnv_state_load(const char *buf, uint32 buflen);
/* Free a state. */
void fnv_state_destroy(fnv_state_t *state);
#endif

329
src/graph.c Normal file
View File

@@ -0,0 +1,329 @@
#include "graph.h"
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <assert.h>
#include <string.h>
#include "vstack.h"
//#define DEBUG
#include "debug.h"
/*
 * Map slot e (from either half of the edge arrays) to the matching slot in
 * half i (0 or 1). Expects a `graph_t *g` to be in scope at the call site.
 * Arguments are parenthesized so compound expressions expand correctly.
 */
#define abs_edge(e, i) ((e) % g->nedges + (i) * g->nedges)
/* Undirected graph stored as per-vertex linked lists threaded through flat arrays. */
struct __graph_t
{
/* number of vertices; length of first[] */
uint32 nnodes;
/* edge capacity; edges[] and next[] hold 2 * nedges slots */
uint32 nedges;
/* edges[s] is the vertex reached through slot s; slot e and slot e + nedges belong to the same edge */
uint32 *edges;
/* first[v] is the first slot in v's list, or EMPTY */
uint32 *first;
/* next[s] is the slot following s in the same list, or EMPTY */
uint32 *next;
/* per-vertex flag array filled in by graph_obtain_critical_nodes() */
uint8 *critical_nodes; /* included -- Fabiano*/
/* number of vertices flagged in critical_nodes */
uint32 ncritical_nodes; /* included -- Fabiano*/
/* number of edges added so far */
uint32 cedges;
/* set once an edge is deleted; graph_add_edge() asserts it is clear */
int shrinking;
};
/* Sentinel marking "no slot" in first[] and next[]; const so it cannot be modified by accident. */
static const uint32 EMPTY = UINT_MAX;
/*
 * Create an empty graph with room for nnodes vertices and nedges edges.
 * Returns NULL if any allocation fails; nothing is leaked in that case.
 */
graph_t *graph_new(uint32 nnodes, uint32 nedges)
{
	graph_t *graph = (graph_t *)malloc(sizeof(graph_t));
	if (!graph) return NULL;
	graph->edges = (uint32 *)malloc(sizeof(uint32) * 2 * nedges);
	graph->next = (uint32 *)malloc(sizeof(uint32) * 2 * nedges);
	graph->first = (uint32 *)malloc(sizeof(uint32) * nnodes);
	graph->critical_nodes = NULL; /* included -- Fabiano*/
	graph->ncritical_nodes = 0; /* included -- Fabiano*/
	graph->nnodes = nnodes;
	graph->nedges = nedges;
	/* malloc(0) may legitimately return NULL, so a NULL array only counts as
	 * a failure when a non-zero number of elements was requested. */
	if ((nedges != 0 && (!graph->edges || !graph->next)) ||
	    (nnodes != 0 && !graph->first))
	{
		free(graph->edges);
		free(graph->next);
		free(graph->first);
		free(graph);
		return NULL;
	}
	graph_clear_edges(graph);
	return graph;
}
/* Free the graph together with every array it owns. */
void graph_destroy(graph_t *graph)
{
	DEBUGP("Destroying graph\n");
	free(graph->critical_nodes); /* included -- Fabiano*/
	free(graph->next);
	free(graph->first);
	free(graph->edges);
	free(graph);
}
/* Print, for each vertex, every edge in its list as "a -> b" on stdout. */
void graph_print(graph_t *g)
{
	uint32 node;
	for (node = 0; node < g->nnodes; ++node)
	{
		uint32 slot;
		DEBUGP("Printing edges connected to %u\n", node);
		/* Follow the vertex's list until the EMPTY sentinel. */
		for (slot = g->first[node]; slot != EMPTY; slot = g->next[slot])
		{
			printf("%u -> %u\n", g->edges[abs_edge(slot, 0)], g->edges[abs_edge(slot, 1)]);
		}
	}
}
/*
 * Add the undirected edge (v1, v2). The edge takes slot cedges in v1's list
 * and slot cedges + nedges in v2's list, each pushed at the list head.
 */
void graph_add_edge(graph_t *g, uint32 v1, uint32 v2)
{
	const uint32 fwd = g->cedges;
	uint32 bwd;
	assert(v1 < g->nnodes);
	assert(v2 < g->nnodes);
	assert(fwd < g->nedges);
	assert(!g->shrinking);
	bwd = fwd + g->nedges;

	/* Slot in v1's list: it points at v2. */
	g->edges[fwd] = v2;
	g->next[fwd] = g->first[v1];
	g->first[v1] = fwd;

	/* Mirror slot in v2's list: it points back at v1. */
	g->edges[bwd] = v1;
	g->next[bwd] = g->first[v2];
	g->first[v2] = bwd;

	g->cedges = fwd + 1;
}
/* Return 1 when slot e belongs to the edge joining v1 and v2 (in either order), else 0. */
static int check_edge(graph_t *g, uint32 e, uint32 v1, uint32 v2)
{
	const uint32 a = g->edges[abs_edge(e, 0)];
	const uint32 b = g->edges[abs_edge(e, 1)];
	DEBUGP("Checking edge %u %u looking for %u %u\n", a, b, v1, v2);
	return (a == v1 && b == v2) || (a == v2 && b == v1);
}
/*
 * Return the first-half slot index of the edge joining v1 and v2.
 * The edge must exist: running off the end of v1's list trips an assert.
 */
uint32 graph_edge_id(graph_t *g, uint32 v1, uint32 v2)
{
	uint32 slot = g->first[v1];
	assert(slot != EMPTY);
	while (!check_edge(g, slot, v1, v2))
	{
		slot = g->next[slot];
		assert(slot != EMPTY);
	}
	return abs_edge(slot, 0);
}
/* Unlink from v1's list the slot that belongs to edge (v1, v2). */
static void del_edge_point(graph_t *g, uint32 v1, uint32 v2)
{
	uint32 cur, before;
	DEBUGP("Deleting edge point %u %u\n", v1, v2);
	cur = g->first[v1];
	if (!check_edge(g, cur, v1, v2))
	{
		/* Not at the head: walk the list, remembering the predecessor. */
		DEBUGP("Checking linked list\n");
		for (;;)
		{
			before = cur;
			cur = g->next[cur];
			assert(cur != EMPTY);
			if (check_edge(g, cur, v1, v2)) break;
		}
		g->next[before] = g->next[cur];
		//g->edges[e] = EMPTY;
	}
	else
	{
		/* Head of the list: just advance the head. */
		g->first[v1] = g->next[cur];
		//g->edges[e] = EMPTY;
	}
	DEBUGP("Deleted\n");
}
/*
 * Remove edge (v1, v2) from both endpoint lists and mark the graph as
 * shrinking, after which graph_add_edge() asserts.
 */
void graph_del_edge(graph_t *g, uint32 v1, uint32 v2)
{
	g->shrinking = 1;
	del_edge_point(g, v1, v2); /* slot in v1's list */
	del_edge_point(g, v2, v1); /* mirror slot in v2's list */
}
/* Reset the graph to "no edges": every list head and slot becomes EMPTY. */
void graph_clear_edges(graph_t *g)
{
	uint32 node;
	uint32 slot;
	g->shrinking = 0;
	g->cedges = 0;
	for (node = 0; node < g->nnodes; ++node)
	{
		g->first[node] = EMPTY;
	}
	for (slot = 0; slot < g->nedges*2; ++slot)
	{
		g->next[slot] = EMPTY;
		g->edges[slot] = EMPTY;
	}
}
/*
 * Check whether v has exactly one edge not yet marked in deleted[].
 * Returns 1 and stores that edge's slot in *e when so, otherwise 0.
 * *e may be written even when 0 is returned.
 */
static int find_degree1_edge(graph_t *g, uint32 v, char *deleted, uint32 *e)
{
	uint32 cur = g->first[v];
	int live = 0;
	DEBUGP("Checking degree of vertex %u\n", v);
	if (cur == EMPTY) return 0;
	/* Head of the list is handled apart from the rest. */
	if (!deleted[abs_edge(cur, 0)])
	{
		*e = cur;
		live = 1;
	}
	for (cur = g->next[cur]; cur != EMPTY; cur = g->next[cur])
	{
		if (deleted[abs_edge(cur, 0)]) continue;
		/* A second live edge means the degree is above one. */
		if (live) return 0;
		DEBUGP("Found first edge\n");
		*e = cur;
		live = 1;
	}
	return live;
}
/*
 * Starting at v, repeatedly mark the single remaining edge of a degree-1
 * vertex as deleted and move to its other endpoint, until a vertex that does
 * not have exactly one live edge is reached.
 */
static void cyclic_del_edge(graph_t *g, uint32 v, char *deleted)
{
	uint32 e;
	uint32 cur = v;
	uint32 other = 0;
	if (!find_degree1_edge(g, cur, deleted, &e)) return;
	for (;;)
	{
		DEBUGP("Deleting edge %u (%u->%u)\n", e, g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]);
		deleted[abs_edge(e, 0)] = 1;
		/* Pick whichever endpoint is not the vertex we came from. */
		other = g->edges[abs_edge(e, 0)];
		if (other == cur) other = g->edges[abs_edge(e, 1)];
		DEBUGP("Checking if second endpoint %u has degree 1\n", other);
		if (!find_degree1_edge(g, other, deleted, &e)) break;
		DEBUGP("Inspecting vertex %u\n", other);
		cur = other;
	}
}
/*
 * Return 1 if the graph contains a cycle, 0 otherwise.
 * Edges hanging off degree-1 vertices are peeled away repeatedly; any edge
 * that survives the peeling lies on a cycle.
 */
int graph_is_cyclic(graph_t *g)
{
	uint32 i;
	uint32 v;
	char *deleted;
	DEBUGP("Looking for cycles in graph with %u vertices and %u edges\n", g->nnodes, g->nedges);
	/* With no edges there is nothing to peel and no cycle; this also avoids
	 * a zero-sized allocation. */
	if (g->nedges == 0) return 0;
	/* calloc hands back zeroed flags; allocation failure is asserted, as in
	 * the project's other containers. */
	deleted = (char *)calloc(g->nedges, sizeof(char));
	assert(deleted);
	for (v = 0; v < g->nnodes; ++v)
	{
		cyclic_del_edge(g, v, deleted);
	}
	for (i = 0; i < g->nedges; ++i)
	{
		if (!(deleted[i]))
		{
			DEBUGP("Edge %u %u->%u was not deleted\n", i, g->edges[i], g->edges[i + g->nedges]);
			free(deleted);
			return 1;
		}
	}
	free(deleted);
	return 0;
}
/* Return the critical flag stored for v by graph_obtain_critical_nodes(). */
uint8 graph_node_is_critical(graph_t * g, uint32 v) /* included -- Fabiano */
{
	const uint8 flag = g->critical_nodes[v];
	return flag;
}
/*
 * Flag every vertex that is an endpoint of an edge surviving the degree-1
 * peeling (the 2-core) and count them in ncritical_nodes.
 */
void graph_obtain_critical_nodes(graph_t *g) /* included -- Fabiano*/
{
	uint32 i;
	uint32 v;
	char *deleted = (char *)calloc(g->nedges, sizeof(char));
	assert(deleted || g->nedges == 0);
	/* Drop any array left by an earlier call so it does not leak. */
	free(g->critical_nodes);
	/* The flags must start at zero: they are tested before being set below,
	 * so an uninitialized array would corrupt both flags and the count. */
	g->critical_nodes = (uint8 *)calloc(g->nnodes, sizeof(uint8));
	assert(g->critical_nodes || g->nnodes == 0);
	g->ncritical_nodes = 0;
	DEBUGP("Looking for the 2-core in graph with %u vertices and %u edges\n", g->nnodes, g->nedges);
	for (v = 0; v < g->nnodes; ++v)
	{
		cyclic_del_edge(g, v, deleted);
	}
	for (i = 0; i < g->nedges; ++i)
	{
		if (!(deleted[i]))
		{
			DEBUGP("Edge %u %u->%u belongs to the 2-core\n", i, g->edges[i], g->edges[i + g->nedges]);
			if(!(g->critical_nodes[g->edges[i]]))
			{
				g->ncritical_nodes ++;
				g->critical_nodes[g->edges[i]] = 1;
			}
			if(!(g->critical_nodes[g->edges[i + g->nedges]]))
			{
				g->ncritical_nodes ++;
				g->critical_nodes[g->edges[i + g->nedges]] = 1;
			}
		}
	}
	free(deleted);
}
/* Return 1 when v1's list holds an edge joining v1 and v2, else 0. */
uint8 graph_contains_edge(graph_t *g, uint32 v1, uint32 v2) /* included -- Fabiano*/
{
	uint32 slot;
	for (slot = g->first[v1]; slot != EMPTY; slot = g->next[slot])
	{
		if (check_edge(g, slot, v1, v2)) return 1;
	}
	return 0;
}
/* Return the vertex stored for edge e in half id (0 or 1) of the edge array. */
uint32 graph_vertex_id(graph_t *g, uint32 e, uint32 id) /* included -- Fabiano*/
{
	const uint32 slot = e + id*g->nedges;
	return g->edges[slot];
}
/* Return the count computed by the last graph_obtain_critical_nodes() call (0 before any call). */
uint32 graph_ncritical_nodes(graph_t *g) /* included -- Fabiano*/
{
	const uint32 count = g->ncritical_nodes;
	return count;
}
/* Build an iterator positioned at the first slot of v's list. */
graph_iterator_t graph_neighbors_it(graph_t *g, uint32 v)
{
	graph_iterator_t it;
	it.edge = g->first[v];
	it.vertex = v;
	return it;
}
/*
 * Return the next neighbor of it->vertex and advance the iterator, or
 * GRAPH_NO_NEIGHBOR once the list is exhausted.
 *
 * graph_add_edge() stores, in every slot of a vertex's list, the vertex at
 * the other end of the edge, so edges[it->edge] is always the neighbor. The
 * earlier fallback to edges[it->edge + nedges] could only trigger for a
 * self-loop, and for a second-half slot it read past the end of edges[].
 */
uint32 graph_next_neighbor(graph_t *g, graph_iterator_t* it)
{
	uint32 ret;
	if(it->edge == EMPTY) return GRAPH_NO_NEIGHBOR;
	ret = g->edges[it->edge];
	it->edge = g->next[it->edge];
	return ret;
}

39
src/graph.h Normal file
View File

@@ -0,0 +1,39 @@
/* Public interface of the undirected graph used by the MPH algorithms. */
#ifndef _CMPH_GRAPH_H__
#define _CMPH_GRAPH_H__
#include <limits.h>
#include "cmph_types.h"
/* Value returned by graph_next_neighbor() when the list is exhausted. */
#define GRAPH_NO_NEIGHBOR UINT_MAX
/* Opaque graph type; the layout is private to graph.c. */
typedef struct __graph_t graph_t;
typedef struct __graph_iterator_t graph_iterator_t;
/* Cursor over the neighbors of one vertex. */
struct __graph_iterator_t
{
uint32 vertex;
uint32 edge;
};
/* Create an empty graph with nnodes vertices and room for nedges edges. */
graph_t *graph_new(uint32 nnodes, uint32 nedges);
/* Free a graph and its arrays. */
void graph_destroy(graph_t *graph);
/* Add the undirected edge (v1, v2). */
void graph_add_edge(graph_t *g, uint32 v1, uint32 v2);
//void graph_del_edge(graph_t *g, uint32 v1, uint32 v2);
/* Remove all edges, keeping the allocated capacity. */
void graph_clear_edges(graph_t *g);
/* Return the index of the edge joining v1 and v2; the edge must exist. */
uint32 graph_edge_id(graph_t *g, uint32 v1, uint32 v2);
/* Start iterating over the neighbors of v. */
graph_iterator_t graph_neighbors_it(graph_t *g, uint32 v);
/* Return the next neighbor, or GRAPH_NO_NEIGHBOR when done. */
uint32 graph_next_neighbor(graph_t *g, graph_iterator_t* it);
/* Compute which vertices touch an edge that survives degree-1 peeling. */
void graph_obtain_critical_nodes(graph_t *g); /* included -- Fabiano*/
/* Return the flag computed for v by graph_obtain_critical_nodes(). */
uint8 graph_node_is_critical(graph_t * g, uint32 v); /* included -- Fabiano */
/* Return how many vertices were flagged critical. */
uint32 graph_ncritical_nodes(graph_t *g); /* included -- Fabiano*/
/* Return the vertex stored for edge e in half id (0 or 1) of the edge array. */
uint32 graph_vertex_id(graph_t *g, uint32 e, uint32 id); /* included -- Fabiano*/
/* Return 1 if the graph has a cycle, 0 otherwise. */
int graph_is_cyclic(graph_t *g);
/* Print the edge lists to stdout. */
void graph_print(graph_t *);
#endif

139
src/hash.c Normal file
View File

@@ -0,0 +1,139 @@
#include "hash_state.h"
#include <stdlib.h>
#include <assert.h>
#include <limits.h>
#include <string.h>
//#define DEBUG
#include "debug.h"
/*
 * Names of the hash functions, indexed by CMPH_HASH value and terminated by
 * NULL. hash_state_dump() writes the name as a prefix and hash_state_load()
 * matches on it.
 * NOTE(review): "glib" and "pjw" have no case in the switches below -- confirm
 * how they relate to HASH_COUNT.
 */
const char *hash_names[] = { "jenkins", "djb2", "sdbm", "fnv", "glib", "pjw", NULL };
/*
 * Create a state for the requested hash function. hashsize is forwarded to
 * the Jenkins constructor only.
 * Returns NULL when the underlying constructor fails or, in builds without
 * assertions, when hashfunc is not a supported value.
 */
hash_state_t *hash_state_new(CMPH_HASH hashfunc, uint32 hashsize)
{
	hash_state_t *state = NULL;
	switch (hashfunc)
	{
		case HASH_JENKINS:
			DEBUGP("Jenkins function - %u\n", hashsize);
			state = (hash_state_t *)jenkins_state_new(hashsize);
			DEBUGP("Jenkins function created\n");
			break;
		case HASH_DJB2:
			state = (hash_state_t *)djb2_state_new();
			break;
		case HASH_SDBM:
			state = (hash_state_t *)sdbm_state_new();
			break;
		case HASH_FNV:
			state = (hash_state_t *)fnv_state_new();
			break;
		default:
			assert(0);
	}
	/* Guard the tag write: state is still NULL if construction failed. */
	if (state != NULL) state->hashfunc = hashfunc;
	return state;
}
/*
 * Hash keylen bytes of key with the function recorded in state.
 * An unknown tag asserts; with assertions disabled the result is 0.
 */
uint32 hash(hash_state_t *state, const char *key, uint32 keylen)
{
	const CMPH_HASH kind = state->hashfunc;
	if (kind == HASH_JENKINS)
		return jenkins_hash((jenkins_state_t *)state, key, keylen);
	if (kind == HASH_DJB2)
		return djb2_hash((djb2_state_t *)state, key, keylen);
	if (kind == HASH_SDBM)
		return sdbm_hash((sdbm_state_t *)state, key, keylen);
	if (kind == HASH_FNV)
		return fnv_hash((fnv_state_t *)state, key, keylen);
	assert(0);
	return 0;
}
/*
 * Serialize state into a newly allocated buffer: the NUL-terminated function
 * name followed by the algorithm-specific bytes.
 * On success *buf owns the buffer and *buflen is its size.
 * On failure *buf is NULL and *buflen is UINT_MAX.
 */
void hash_state_dump(hash_state_t *state, char **buf, uint32 *buflen)
{
	char *algobuf = NULL;
	uint32 namelen;
	*buf = NULL;
	switch (state->hashfunc)
	{
		case HASH_JENKINS:
			jenkins_state_dump((jenkins_state_t *)state, &algobuf, buflen);
			if (*buflen == UINT_MAX) return;
			break;
		case HASH_DJB2:
			djb2_state_dump((djb2_state_t *)state, &algobuf, buflen);
			if (*buflen == UINT_MAX) return;
			break;
		case HASH_SDBM:
			sdbm_state_dump((sdbm_state_t *)state, &algobuf, buflen);
			if (*buflen == UINT_MAX) return;
			break;
		case HASH_FNV:
			fnv_state_dump((fnv_state_t *)state, &algobuf, buflen);
			if (*buflen == UINT_MAX) return;
			break;
		default:
			assert(0);
			/* Without assertions, report failure rather than index
			 * hash_names[] with an unknown tag. */
			*buflen = UINT_MAX;
			return;
	}
	namelen = strlen(hash_names[state->hashfunc]) + 1;
	*buf = malloc(namelen + *buflen);
	if (*buf == NULL)
	{
		free(algobuf);
		*buflen = UINT_MAX;
		return;
	}
	memcpy(*buf, hash_names[state->hashfunc], namelen);
	/* The stateless hashes dump a NULL buffer of length 0; only look at
	 * algobuf when there are bytes behind it. */
	if (algobuf != NULL && *buflen >= sizeof(uint32))
	{
		DEBUGP("Algobuf is %u\n", *(uint32 *)algobuf);
	}
	if (*buflen > 0) memcpy(*buf + namelen, algobuf, *buflen);
	*buflen = namelen + *buflen;
	free(algobuf);
	return;
}
/*
 * Rebuild a hash state from a buffer produced by hash_state_dump(): a
 * NUL-terminated function name followed by algorithm-specific bytes.
 * Returns NULL when the name is unknown or does not fit inside buflen.
 */
hash_state_t *hash_state_load(const char *buf, uint32 buflen)
{
	uint32 i;
	uint32 offset = 0;
	CMPH_HASH hashfunc = HASH_COUNT;
	for (i = 0; i < HASH_COUNT && hash_names[i] != NULL; ++i)
	{
		/* Compare name plus terminator, but never read past buflen: the
		 * buffer is length-delimited and may not contain a NUL at all. */
		uint32 namelen = strlen(hash_names[i]) + 1;
		if (namelen <= buflen && memcmp(buf, hash_names[i], namelen) == 0)
		{
			hashfunc = i;
			offset = namelen;
			break;
		}
	}
	if (hashfunc == HASH_COUNT) return NULL;
	/* offset <= buflen holds here, so buflen - offset cannot wrap. */
	switch (hashfunc)
	{
		case HASH_JENKINS:
			return (hash_state_t *)jenkins_state_load(buf + offset, buflen - offset);
		case HASH_DJB2:
			return (hash_state_t *)djb2_state_load(buf + offset, buflen - offset);
		case HASH_SDBM:
			return (hash_state_t *)sdbm_state_load(buf + offset, buflen - offset);
		case HASH_FNV:
			return (hash_state_t *)fnv_state_load(buf + offset, buflen - offset);
		default:
			return NULL;
	}
	return NULL;
}
/* Dispatch destruction to the algorithm recorded in state; unknown tags assert. */
void hash_state_destroy(hash_state_t *state)
{
	const CMPH_HASH kind = state->hashfunc;
	if (kind == HASH_JENKINS)
		jenkins_state_destroy((jenkins_state_t *)state);
	else if (kind == HASH_DJB2)
		djb2_state_destroy((djb2_state_t *)state);
	else if (kind == HASH_SDBM)
		sdbm_state_destroy((sdbm_state_t *)state);
	else if (kind == HASH_FNV)
		fnv_state_destroy((fnv_state_t *)state);
	else
		assert(0);
}

14
src/hash.h Normal file
View File

@@ -0,0 +1,14 @@
/* Algorithm-independent hash interface; dispatches on the CMPH_HASH tag held in the state. */
#ifndef __CMPH_HASH_H__
#define __CMPH_HASH_H__
#include "cmph_types.h"
/* Opaque here; the union is defined in hash_state.h. */
typedef union __hash_state_t hash_state_t;
/* Create a state for the given function; hashsize is passed to the Jenkins constructor. */
hash_state_t *hash_state_new(CMPH_HASH, uint32 hashsize);
/* Hash keylen bytes of key with the function recorded in state. */
uint32 hash(hash_state_t *state, const char *key, uint32 keylen);
/* Serialize state into a newly allocated *buf; *buflen is UINT_MAX on failure. */
void hash_state_dump(hash_state_t *state, char **buf, uint32 *buflen);
/* Rebuild a state from a dumped buffer; NULL when the function name is unknown. */
hash_state_t *hash_state_load(const char *buf, uint32 buflen);
/* Destroy a state through its algorithm-specific destructor. */
void hash_state_destroy(hash_state_t *state);
#endif

18
src/hash_state.h Normal file
View File

@@ -0,0 +1,18 @@
/* Definition of the hash_state_t union shared by all hash implementations. */
#ifndef __HASH_STATE_H__
#define __HASH_STATE_H__
#include "hash.h"
#include "jenkins_hash.h"
#include "djb2_hash.h"
#include "sdbm_hash.h"
#include "fnv_hash.h"
/*
 * Every per-algorithm state struct begins with a CMPH_HASH hashfunc member,
 * so the tag can be read through the union whichever member is in use.
 */
union __hash_state_t
{
CMPH_HASH hashfunc;
jenkins_state_t jenkins;
djb2_state_t djb2;
sdbm_state_t sdbm;
fnv_state_t fnv;
};
#endif

191
src/jenkins_hash.c Normal file
View File

@@ -0,0 +1,191 @@
#include "jenkins_hash.h"
#include <stdlib.h>
#include <math.h>
#include <limits.h>
#include <string.h>
#include <netinet/in.h>
//#define DEBUG
#include "debug.h"
/* hashsize(n) is 2^n as a uint32; hashmask(n) is the matching low-bit mask. */
#define hashsize(n) ((uint32)1<<(n))
#define hashmask(n) (hashsize(n)-1)
//#define NM2 /* Define this if you do not want power of 2 table sizes*/
/*
--------------------------------------------------------------------
mix -- mix 3 32-bit values reversibly.
For every delta with one or two bits set, and the deltas of all three
high bits or all three low bits, whether the original value of a,b,c
is almost all zero or is uniformly distributed,
* If mix() is run forward or backward, at least 32 bits in a,b,c
have at least 1/4 probability of changing.
* If mix() is run forward, every bit of c will change between 1/3 and
2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.)
mix() was built out of 36 single-cycle latency instructions in a
structure that could support 2x parallelism, like so:
a -= b;
a -= c; x = (c>>13);
b -= c; a ^= x;
b -= a; x = (a<<8);
c -= a; b ^= x;
c -= b; x = (b>>13);
...
Unfortunately, superscalar Pentiums and Sparcs can't take advantage
of that parallelism. They've also turned some of those single-cycle
latency instructions into multi-cycle latency instructions. Still,
this is the fastest good hash I could find. There were about 2^^68
to choose from. I only looked at a billion or so.
--------------------------------------------------------------------
*/
/*
 * Mixes a, b and c in place. Each argument is read and written many times,
 * so pass plain lvalues without side effects. The expansion is a bare block,
 * not a do/while(0) statement.
 */
#define mix(a,b,c) \
{ \
a -= b; a -= c; a ^= (c>>13); \
b -= c; b -= a; b ^= (a<<8); \
c -= a; c -= b; c ^= (b>>13); \
a -= b; a -= c; a ^= (c>>12); \
b -= c; b -= a; b ^= (a<<16); \
c -= a; c -= b; c ^= (b>>5); \
a -= b; a -= c; a ^= (c>>3); \
b -= c; b -= a; b ^= (a<<10); \
c -= a; c -= b; c ^= (b>>15); \
}
/*
--------------------------------------------------------------------
hash() -- hash a variable-length key into a 32-bit value
k : the key (the unaligned variable-length array of bytes)
len : the length of the key, counting by bytes
initval : can be any 4-byte value
Returns a 32-bit value. Every bit of the key affects every bit of
the return value. Every 1-bit and 2-bit delta achieves avalanche.
About 6*len+35 instructions.
The best hash table sizes are powers of 2. There is no need to do
mod a prime (mod is sooo slow!). If you need less than 32 bits,
use a bitmask. For example, if you need only 10 bits, do
h = (h & hashmask(10));
In which case, the hash table should have hashsize(10) elements.
If you are hashing n strings (uint8 **)k, do it like this:
for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this
code any way you wish, private, educational, or commercial. It's free.
See http://burtleburtle.net/bob/hash/evahash.html
Use for hash table lookup, or anything where one collision in 2^^32 is
acceptable. Do NOT use for cryptographic purposes.
--------------------------------------------------------------------
*/
/*
 * Create a Jenkins hash state for a table of `size` entries.
 * The seed is drawn with rand() in [0, size); nbits is ceil(log2(size)).
 * Returns NULL when memory cannot be allocated.
 */
jenkins_state_t *jenkins_state_new(uint32 size) //size of hash table
{
	jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t));
	uint32 nbits = 0;
	if (state == NULL) return NULL;
	DEBUGP("Initializing jenkins hash\n");
	/* Tag the state here too, as jenkins_state_load() does. */
	state->hashfunc = HASH_JENKINS;
	/* rand() % 0 is undefined, so an empty table gets seed 0. */
	state->seed = (size > 0) ? rand() % size : 0;
	/* Smallest nbits with 2^nbits >= size, computed with integers. The old
	 * expression log(size)/M_LOG2E divided by log2(e) instead of
	 * multiplying, which does not yield log2(size). */
	while (nbits < 32 && ((uint32)1 << nbits) < size) ++nbits;
	state->nbits = nbits;
	state->size = size;
	DEBUGP("Initialized jenkins with size %u, nbits %u and seed %u\n", size, state->nbits, state->seed);
	return state;
}
/* Release a state created by jenkins_state_new() or jenkins_state_load(). */
void jenkins_state_destroy(jenkins_state_t *state)
{
	free(state);
	return;
}
/*
 * Hash keylen bytes of k with the Jenkins mix defined above, using
 * state->seed as the initial value of c.
 *
 * The key is read through an unsigned char pointer. With plain char, bytes
 * >= 0x80 are sign-extended where char is signed, so the same key hashes
 * differently across platforms.
 * NOTE(review): on signed-char platforms this changes the hash of keys that
 * contain bytes >= 0x80; functions generated earlier for such keys would
 * need regenerating.
 */
uint32 jenkins_hash(jenkins_state_t *state, const char *k, uint32 keylen)
{
	const unsigned char *p = (const unsigned char *)k;
	uint32 a, b, c;
	uint32 len, length;
	/* Set up the internal state */
	length = keylen;
	len = length;
	a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */
	c = state->seed; /* the previous hash value - seed in our case */
	/*---------------------------------------- handle most of the key */
	while (len >= 12)
	{
		a += (p[0] +((uint32)p[1]<<8) +((uint32)p[2]<<16) +((uint32)p[3]<<24));
		b += (p[4] +((uint32)p[5]<<8) +((uint32)p[6]<<16) +((uint32)p[7]<<24));
		c += (p[8] +((uint32)p[9]<<8) +((uint32)p[10]<<16)+((uint32)p[11]<<24));
		mix(a,b,c);
		p += 12; len -= 12;
	}
	/*------------------------------------- handle the last 11 bytes */
	c += length;
	switch(len) /* all the case statements fall through */
	{
		case 11:
			c +=((uint32)p[10]<<24);
		case 10:
			c +=((uint32)p[9]<<16);
		case 9 :
			c +=((uint32)p[8]<<8);
			/* the first byte of c is reserved for the length */
		case 8 :
			b +=((uint32)p[7]<<24);
		case 7 :
			b +=((uint32)p[6]<<16);
		case 6 :
			b +=((uint32)p[5]<<8);
		case 5 :
			b +=p[4];
		case 4 :
			a +=((uint32)p[3]<<24);
		case 3 :
			a +=((uint32)p[2]<<16);
		case 2 :
			a +=((uint32)p[1]<<8);
		case 1 :
			a +=p[0];
			/* case 0: nothing left to add */
	}
	mix(a,b,c);
	/*-------------------------------------------- report the result */
	//c = (c & hashmask(state->size));
	//c = (c >= state->size) ? c ^ state->size: c;
	//state->last_hash = c; Do not update last_hash because we use a fixed
	//seed
	return c;
}
/*
 * Serialize seed, nbits and size as three uint32 values in network byte
 * order into a newly allocated buffer. On allocation failure *buf is NULL
 * and *buflen is UINT_MAX.
 */
void jenkins_state_dump(jenkins_state_t *state, char **buf, uint32 *buflen)
{
	uint32 words[3];
	words[0] = htonl(state->seed);
	words[1] = htonl(state->nbits);
	words[2] = htonl(state->size);
	*buflen = sizeof(uint32)*3;
	*buf = malloc(*buflen);
	if (!*buf)
	{
		*buflen = UINT_MAX;
		return;
	}
	/* The three words are contiguous, so one copy lays them out in order. */
	memcpy(*buf, words, sizeof(words));
	DEBUGP("Dumped jenkins state with seed %u\n", state->seed);
}
/*
 * Rebuild a Jenkins state from the three network-order uint32 values written
 * by jenkins_state_dump() (seed, nbits, size).
 * Returns NULL when the buffer is too short or memory cannot be allocated.
 */
jenkins_state_t *jenkins_state_load(const char *buf, uint32 buflen)
{
	uint32 words[3];
	jenkins_state_t *state;
	if (buf == NULL || buflen < sizeof(words)) return NULL;
	state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t));
	if (state == NULL) return NULL;
	/* buf sits at an arbitrary offset in the dump (right after the name), so
	 * copy the words out rather than dereferencing a cast uint32 pointer. */
	memcpy(words, buf, sizeof(words));
	state->seed = ntohl(words[0]);
	state->nbits = ntohl(words[1]);
	state->size = ntohl(words[2]);
	state->hashfunc = HASH_JENKINS;
	DEBUGP("Loaded jenkins state with seed %u\n", state->seed);
	return state;
}

20
src/jenkins_hash.h Normal file
View File

@@ -0,0 +1,20 @@
/* Interface of the Jenkins hash function and its state object. */
#ifndef __JEKINS_HASH_H__
#define __JEKINS_HASH_H__
#include "hash.h"
/* State of a Jenkins hasher. */
typedef struct __jenkins_state_t
{
CMPH_HASH hashfunc;
/* initial value mixed into the hash */
uint32 seed;
/* computed from size by jenkins_state_new(); stored and dumped */
uint32 nbits;
/* table size the state was created for */
uint32 size;
} jenkins_state_t;
/* Create a state with a seed drawn via rand(). */
jenkins_state_t *jenkins_state_new(uint32 size); //size of hash table
/* Hash the keylen bytes starting at k using the state's seed. */
uint32 jenkins_hash(jenkins_state_t *state, const char *k, uint32 keylen);
/* Serialize seed, nbits and size in network byte order; *buflen is UINT_MAX on failure. */
void jenkins_state_dump(jenkins_state_t *state, char **buf, uint32 *buflen);
/* Rebuild a state from a buffer written by jenkins_state_dump(). */
jenkins_state_t *jenkins_state_load(const char *buf, uint32 buflen);
/* Free a state. */
void jenkins_state_destroy(jenkins_state_t *state);
#endif

282
src/main.c Normal file
View File

@@ -0,0 +1,282 @@
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <assert.h>
#include "cmph.h"
#include "hash.h"
#include "../config.h"
/*
 * Print the one-line synopsis to stderr. The synopsis lists -f, which the
 * option parser accepts.
 */
void usage(const char *prg)
{
	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-g [-s seed] ] [-m file.mph] [-a algorithm] [-f hash_function] keysfile\n", prg);
}
/*
 * Print the synopsis followed by a description of every option to stderr,
 * including the valid algorithm and hash function names. The synopsis lists
 * -f, matching the option table below it.
 */
void usage_long(const char *prg)
{
	uint32 i;
	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-g [-s seed] ] [-m file.mph] [-a algorithm] [-f hash_function] keysfile\n", prg);
	fprintf(stderr, "Minimum perfect hashing tool\n\n");
	fprintf(stderr, " -h\t print this help message\n");
	fprintf(stderr, " -a\t algorithm - valid values are\n");
	for (i = 0; i < MPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", mph_names[i]);
	fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n");
	for (i = 0; i < HASH_COUNT; ++i) fprintf(stderr, " \t * %s\n", hash_names[i]);
	fprintf(stderr, " -V\t print version number and exit\n");
	fprintf(stderr, " -v\t increase verbosity (may be used multiple times)\n");
	fprintf(stderr, " -g\t generation mode\n");
	fprintf(stderr, " -s\t random seed\n");
	fprintf(stderr, " -m\t minimum perfect hash function file \n");
	fprintf(stderr, " keysfile\t line separated file with keys\n");
}
/*
 * Read the next newline-terminated line from the FILE* in data.
 * On success *key is a newly allocated, NUL-terminated copy without the
 * trailing newline, *keylen is its length, and that length is returned.
 * On end of file, read error or allocation failure, -1 is returned with
 * *key set to NULL and *keylen to 0. A final line with no trailing newline
 * is treated as end of input, matching count_keys().
 */
static int key_read(void *data, char **key, uint32 *keylen)
{
	FILE *fd = (FILE *)data;
	*key = NULL;
	*keylen = 0;
	while(1)
	{
		char buf[BUFSIZ];
		char *grown;
		size_t chunk;
		char *c = fgets(buf, BUFSIZ, fd);
		if (c == NULL) goto fail;
		if (feof(fd)) goto fail;
		chunk = strlen(buf);
		/* A chunk beginning with a NUL byte has length 0; skip it rather
		 * than index buf[-1]. */
		if (chunk == 0) continue;
		grown = (char *)realloc(*key, *keylen + chunk + 1);
		if (grown == NULL) goto fail;
		*key = grown;
		/* Copy the terminator too, so the key is always a valid C string. */
		memcpy(*key + *keylen, buf, chunk + 1);
		*keylen += chunk;
		if (buf[chunk - 1] != '\n') continue;
		break;
	}
	if ((*keylen) && (*key)[*keylen - 1] == '\n')
	{
		(*key)[(*keylen) - 1] = 0;
		--(*keylen);
	}
	return *keylen;
fail:
	/* Do not leak a partially assembled key. */
	free(*key);
	*key = NULL;
	*keylen = 0;
	return -1;
}
/* Release a key returned by key_read(); data and keylen are not used. */
static void key_dispose(void *data, char *key, uint32 keylen)
{
	free(key);
	return;
}
/* Reposition the key file (a FILE* passed through data) at its beginning. */
static void key_rewind(void *data)
{
	rewind((FILE *)data);
}
/*
 * Count the newline-terminated lines in fd. The stream is rewound before
 * and after counting. A final line with no trailing newline is not counted.
 */
static uint32 count_keys(FILE *fd)
{
	uint32 count = 0;
	rewind(fd);
	while(1)
	{
		char buf[BUFSIZ];
		size_t len;
		/* Stop on any fgets failure: on a read error feof() stays false,
		 * and looping on would never terminate. */
		if (fgets(buf, BUFSIZ, fd) == NULL) break;
		if (feof(fd)) break;
		len = strlen(buf);
		/* len == 0 happens when the chunk starts with a NUL byte. */
		if (len == 0 || buf[len - 1] != '\n') continue;
		++count;
	}
	rewind(fd);
	return count;
}
int main(int argc, char **argv)
{
char verbosity = 0;
char generate = 0;
char *mphf_file = NULL;
FILE *mphf_fd = stdout;
const char *keys_file = NULL;
FILE *keys_fd;
uint32 seed = UINT_MAX;
CMPH_HASH *hashes = NULL;
uint32 nhashes = 0;
uint32 i;
MPH_ALGO mph_algo = MPH_CZECH;
mph_t *mph = NULL;
mphf_t *mphf = NULL;
key_source_t source;
while (1)
{
char c = getopt(argc, argv, "hVva:f:gm:s:");
if (c == -1) break;
switch (c)
{
case 's':
{
char *cptr;
seed = strtoul(optarg, &cptr, 10);
if(*cptr != 0) {
fprintf(stderr, "Invalid seed %s\n", optarg);
exit(1);
}
}
break;
case 'g':
generate = 1;
break;
case 'm':
mphf_file = strdup(optarg);
break;
case 'v':
++verbosity;
break;
case 'V':
printf("%s\n", VERSION);
return 0;
case 'h':
usage_long(argv[0]);
return 0;
case 'a':
{
char valid = 0;
for (i = 0; i < MPH_COUNT; ++i)
{
if (strcmp(mph_names[i], optarg) == 0)
{
mph_algo = i;
valid = 1;
break;
}
}
if (!valid)
{
fprintf(stderr, "Invalid mph algorithm: %s\n", optarg);
return -1;
}
}
break;
case 'f':
{
char valid = 0;
for (i = 0; i < HASH_COUNT; ++i)
{
if (strcmp(hash_names[i], optarg) == 0)
{
hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 ));
hashes[nhashes] = i;
hashes[nhashes + 1] = HASH_COUNT;
++nhashes;
valid = 1;
break;
}
}
if (!valid)
{
fprintf(stderr, "Invalid hash function: %s\n", optarg);
return -1;
}
}
break;
default:
usage(argv[0]);
return 1;
}
}
if (optind != argc - 1)
{
usage(argv[0]);
return 1;
}
keys_file = argv[optind];
if (seed == UINT_MAX) seed = time(NULL);
srand(seed);
if (mphf_file == NULL)
{
mphf_file = (char *)malloc(strlen(keys_file) + 5);
memcpy(mphf_file, keys_file, strlen(keys_file));
memcpy(mphf_file + strlen(keys_file), ".mph\0", 5);
}
keys_fd = fopen(keys_file, "r");
if (keys_fd == NULL)
{
fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
return -1;
}
source.data = (void *)keys_fd;
source.nkeys = count_keys(keys_fd);
source.read = key_read;
source.dispose = key_dispose;
source.rewind = key_rewind;
if (generate)
{
//Create mphf
mph = mph_new(mph_algo, &source);
if (nhashes) mph_set_hashfuncs(mph, hashes);
mph_set_verbosity(mph, verbosity);
mphf = mph_create(mph);
if (mphf == NULL)
{
fprintf(stderr, "Unable to create minimum perfect hashing function\n");
mph_destroy(mph);
free(mphf_file);
return -1;
}
mphf_fd = fopen(mphf_file, "w");
if (mphf_fd == NULL)
{
fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno));
free(mphf_file);
return -1;
}
mphf_dump(mphf, mphf_fd);
mphf_destroy(mphf);
fclose(mphf_fd);
}
else
{
uint8 * hashtable = NULL;
mphf_fd = fopen(mphf_file, "r");
if (mphf_fd == NULL)
{
fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
free(mphf_file);
return -1;
}
mphf = mphf_load(mphf_fd);
fclose(mphf_fd);
if (!mphf)
{
fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
free(mphf_file);
return -1;
}
hashtable = (uint8*)malloc(source.nkeys*sizeof(uint8));
memset(hashtable, 0, source.nkeys);
//check all keys
for (i = 0; i < source.nkeys; ++i)
{
uint32 h;
char *buf;
uint32 buflen = 0;
source.read(source.data, &buf, &buflen);
h = mphf_search(mphf, buf, buflen);
if(hashtable[h])fprintf(stderr, "collision: %u\n",h);
assert(hashtable[h]==0);
hashtable[h] = 1;
if (verbosity)
{
printf("%s -> %u\n", buf, h);
}
source.dispose(source.data, buf, buflen);
}
mphf_destroy(mphf);
free(hashtable);
}
fclose(keys_fd);
free(mphf_file);
return 0;
}

42
src/sdbm_hash.c Normal file
View File

@@ -0,0 +1,42 @@
#include "sdbm_hash.h"
#include <stdlib.h>
/* Allocate an sdbm state tagged with HASH_SDBM; returns NULL when out of memory. */
sdbm_state_t *sdbm_state_new()
{
	sdbm_state_t *state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t));
	if (state == NULL) return NULL;
	state->hashfunc = HASH_SDBM;
	return state;
}
/* Release a state created by sdbm_state_new() or sdbm_state_load(). */
void sdbm_state_destroy(sdbm_state_t *state)
{
	free(state);
	return;
}
/*
 * Hash keylen bytes of k with the sdbm recurrence used here:
 * hash = byte + (hash << 6) + (hash << 16) - hash, starting from 0.
 * state is not consulted.
 */
uint32 sdbm_hash(sdbm_state_t *state, const char *k, uint32 keylen)
{
	/* Explicit cast: char* and unsigned char* are not implicitly convertible. */
	const unsigned char *ptr = (const unsigned char *)k;
	const unsigned char *end = ptr + keylen;
	unsigned int hash = 0;

	(void)state;
	/* Walking a pointer avoids comparing a signed counter with the unsigned keylen. */
	while (ptr < end)
	{
		hash = *ptr + (hash << 6) + (hash << 16) - hash;
		++ptr;
	}
	return hash;
}
/*
 * Serialize an sdbm state. Nothing needs to be stored, so the caller gets a
 * NULL buffer and a zero length.
 */
void sdbm_state_dump(sdbm_state_t *state, char **buf, uint32 *buflen)
{
	*buflen = 0;
	*buf = NULL;
}
/*
 * Rebuild an sdbm state from a dumped buffer. The buffer carries no data for
 * this algorithm, so buf and buflen are ignored.
 * Returns NULL when memory cannot be allocated.
 */
sdbm_state_t *sdbm_state_load(const char *buf, uint32 buflen)
{
	sdbm_state_t *state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t));
	if (state == NULL) return NULL;
	state->hashfunc = HASH_SDBM;
	return state;
}

17
src/sdbm_hash.h Normal file
View File

@@ -0,0 +1,17 @@
/* Interface of the sdbm hash function and its state object. */
#ifndef __SDBM_HASH_H__
#define __SDBM_HASH_H__
#include "hash.h"
/* State of an sdbm hasher; it carries only the algorithm tag. */
typedef struct __sdbm_state_t
{
CMPH_HASH hashfunc;
} sdbm_state_t;
/* Allocate a new state tagged HASH_SDBM. */
sdbm_state_t *sdbm_state_new();
/* Hash the keylen bytes starting at k. */
uint32 sdbm_hash(sdbm_state_t *state, const char *k, uint32 keylen);
/* Serialize a state; sets *buf to NULL and *buflen to 0 since nothing needs storing. */
void sdbm_state_dump(sdbm_state_t *state, char **buf, uint32 *buflen);
/* Allocate a state from a dumped buffer; the buffer contents are not used. */
sdbm_state_t *sdbm_state_load(const char *buf, uint32 buflen);
/* Free a state. */
void sdbm_state_destroy(sdbm_state_t *state);
#endif

49
src/vqueue.c Normal file
View File

@@ -0,0 +1,49 @@
#include "vqueue.h"
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
/* Fixed-capacity circular FIFO of uint32 values. */
struct __vqueue_t
{
/* storage for `capacity` slots */
uint32 * values;
/* beg: slot before the oldest element; end: slot of the newest element;
 * the queue is empty when beg == end. capacity is the requested capacity
 * plus one. */
uint32 beg, end, capacity;
};
/*
 * Create a queue able to hold `capacity` values. One extra slot is
 * allocated so that a full queue can be told apart from an empty one.
 * Allocation failures are asserted.
 */
vqueue_t * vqueue_new(uint32 capacity)
{
	vqueue_t *q = (vqueue_t *)malloc(sizeof(vqueue_t));
	assert(q);
	q->values = (uint32 *)calloc(capacity+1, sizeof(uint32));
	/* Check the slot array the same way as the struct itself. */
	assert(q->values);
	q->beg = q->end = 0;
	q->capacity = capacity+1;
	return q;
}
/* Return non-zero when the queue holds no values (head and tail coincide). */
uint8 vqueue_is_empty(vqueue_t * q)
{
	const uint8 empty = (q->end == q->beg);
	return empty;
}
/* Append val at the tail; inserting into a full queue trips an assert. */
void vqueue_insert(vqueue_t * q, uint32 val)
{
	const uint32 slot = (q->end + 1)%q->capacity;
	assert(slot != q->beg); // Is queue full?
	q->values[slot] = val;
	q->end = slot;
}
/* Remove and return the oldest value; removing from an empty queue trips an assert. */
uint32 vqueue_remove(vqueue_t * q)
{
	uint32 head;
	assert(!vqueue_is_empty(q)); // Is queue empty?
	head = (q->beg + 1)%q->capacity;
	q->beg = head;
	return q->values[head];
}
/* Print the queued values to stderr, oldest first, one per line. */
void vqueue_print(vqueue_t * q)
{
	uint32 pos = q->beg;
	while (pos != q->end)
	{
		/* Step first: beg names the slot before the oldest element. */
		pos = (pos + 1)%q->capacity;
		fprintf(stderr, "%u\n", q->values[pos]);
	}
}
/*
 * Free the queue: both the slot array and the queue struct allocated by
 * vqueue_new(). The pointer must not be used after this call.
 */
void vqueue_destroy(vqueue_t *q)
{
	free(q->values); q->values = NULL;
	/* vqueue_new() mallocs the struct as well, so release it here. */
	free(q);
}

18
src/vqueue.h Normal file
View File

@@ -0,0 +1,18 @@
/* Fixed-capacity FIFO queue of uint32 values. */
#ifndef __CMPH_VQUEUE_H__
#define __CMPH_VQUEUE_H__
#include "cmph_types.h"
/* Opaque queue type; the layout is private to vqueue.c. */
typedef struct __vqueue_t vqueue_t;
/* Create a queue able to hold `capacity` values. */
vqueue_t * vqueue_new(uint32 capacity);
/* Return non-zero when the queue holds no values. */
uint8 vqueue_is_empty(vqueue_t * q);
/* Append a value; the queue must not be full. */
void vqueue_insert(vqueue_t * q, uint32 val);
/* Remove and return the oldest value; the queue must not be empty. */
uint32 vqueue_remove(vqueue_t * q);
/* Print the queued values to stderr. */
void vqueue_print(vqueue_t * q);
/* Release the queue's storage. */
void vqueue_destroy(vqueue_t * q);
#endif

79
src/vstack.c Normal file
View File

@@ -0,0 +1,79 @@
#include "vstack.h"
#include <stdlib.h>
#include <assert.h>
//#define DEBUG
#include "debug.h"
/* Growable LIFO stack of uint32 values. */
struct __vstack_t
{
/* number of values currently stored; index of the next free slot */
uint32 pointer;
/* heap array of `capacity` slots, or NULL while capacity is 0 */
uint32 *values;
/* number of slots allocated in values */
uint32 capacity;
};
/* Create an empty stack; storage is allocated lazily on the first push or reserve. */
vstack_t *vstack_new()
{
	vstack_t *stack = (vstack_t *)malloc(sizeof(vstack_t));
	assert(stack);
	stack->capacity = 0;
	stack->values = NULL;
	stack->pointer = 0;
	return stack;
}
/* Free the stack together with its value array. */
void vstack_destroy(vstack_t *stack)
{
	assert(stack);
	free(stack->values);
	free(stack);
	return;
}
/* Push val on top of the stack, growing the storage when needed. */
void vstack_push(vstack_t *stack, uint32 val)
{
	assert(stack);
	vstack_reserve(stack, stack->pointer + 1);
	stack->values[stack->pointer++] = val;
}
/* Discard the top value; popping an empty stack trips an assert. */
void vstack_pop(vstack_t *stack)
{
	assert(stack);
	assert(stack->pointer > 0);
	stack->pointer -= 1;
}
/* Return the top value without removing it; the stack must not be empty. */
uint32 vstack_top(vstack_t *stack)
{
	uint32 last;
	assert(stack);
	assert(stack->pointer > 0);
	last = stack->pointer - 1;
	return stack->values[last];
}
/* Return 1 when the stack holds no values, 0 otherwise. */
int vstack_empty(vstack_t *stack)
{
	assert(stack);
	return (stack->pointer == 0) ? 1 : 0;
}
/* Return the number of values currently on the stack. */
uint32 vstack_size(vstack_t *stack)
{
	const uint32 count = stack->pointer;
	return count;
}
/*
 * Make sure the stack can hold at least `size` values. Capacity grows by
 * doubling from capacity + 1; allocation failure is asserted.
 */
void vstack_reserve(vstack_t *stack, uint32 size)
{
	assert(stack);
	if (stack->capacity < size)
	{
		uint32 new_capacity = stack->capacity + 1;
		DEBUGP("Increasing current capacity %u to %u\n", stack->capacity, size);
		while (new_capacity < size)
		{
			/* Doubling can wrap around to 0 for very large sizes, which
			 * would spin forever; fall back to the exact size. */
			uint32 doubled = new_capacity * 2;
			if (doubled <= new_capacity)
			{
				new_capacity = size;
				break;
			}
			new_capacity = doubled;
		}
		stack->values = (uint32 *)realloc(stack->values, sizeof(uint32)*new_capacity);
		assert(stack->values);
		stack->capacity = new_capacity;
		DEBUGP("Increased\n");
	}
}

18
src/vstack.h Normal file
View File

@@ -0,0 +1,18 @@
/* Growable LIFO stack of uint32 values. */
#ifndef __CMPH_VSTACK_H__
#define __CMPH_VSTACK_H__
#include "cmph_types.h"
/* Opaque stack type; the layout is private to vstack.c. */
typedef struct __vstack_t vstack_t;
/* Create an empty stack. */
vstack_t *vstack_new();
/* Free the stack and its storage. */
void vstack_destroy(vstack_t *stack);
/* Push a value, growing the storage when needed. */
void vstack_push(vstack_t *stack, uint32 val);
/* Return the top value; the stack must not be empty. */
uint32 vstack_top(vstack_t *stack);
/* Discard the top value; the stack must not be empty. */
void vstack_pop(vstack_t *stack);
/* Return non-zero when the stack holds no values. */
int vstack_empty(vstack_t *stack);
/* Return the number of stored values. */
uint32 vstack_size(vstack_t *stack);
/* Ensure room for at least `size` values. */
void vstack_reserve(vstack_t *stack, uint32 size);
#endif