2005-01-27 15:01:45 +02:00
|
|
|
#ifdef WIN32
|
2008-04-28 04:18:23 +03:00
|
|
|
#include "wingetopt.h"
|
2005-01-27 15:01:45 +02:00
|
|
|
#else
|
|
|
|
#include <getopt.h>
|
|
|
|
#endif
|
2004-12-23 15:16:30 +02:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <errno.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <time.h>
|
|
|
|
#include <limits.h>
|
|
|
|
#include <assert.h>
|
|
|
|
#include "cmph.h"
|
|
|
|
#include "hash.h"
|
2005-01-18 14:18:51 +02:00
|
|
|
|
|
|
|
#ifdef WIN32
|
2008-03-26 22:26:48 +02:00
|
|
|
#define VERSION "0.8"
|
2005-01-18 14:18:51 +02:00
|
|
|
#else
|
|
|
|
#include "config.h"
|
|
|
|
#endif
|
|
|
|
|
2004-12-23 15:16:30 +02:00
|
|
|
|
|
|
|
/*
 * Print the one-line usage synopsis to stderr.
 *
 * prg: program name substituted into the synopsis.
 */
void usage(const char *prg)
{
	// Adjacent literals are concatenated by the compiler; the emitted text is a single line.
	fprintf(stderr,
		"usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] "
		"[-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] "
		"[-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] "
		"[-d tmp_dir] [-m file.mph] keysfile\n",
		prg);
}
|
|
|
|
void usage_long(const char *prg)
|
|
|
|
{
|
2005-01-18 23:06:08 +02:00
|
|
|
cmph_uint32 i;
|
2009-03-18 21:40:23 +02:00
|
|
|
fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg);
|
2004-12-23 15:16:30 +02:00
|
|
|
fprintf(stderr, "Minimum perfect hashing tool\n\n");
|
|
|
|
fprintf(stderr, " -h\t print this help message\n");
|
2007-02-14 04:14:10 +02:00
|
|
|
fprintf(stderr, " -c\t c value determines:\n");
|
2009-06-13 03:49:26 +03:00
|
|
|
fprintf(stderr, " \t * the number of vertices in the graph for the algorithms BMZ and CHM\n");
|
|
|
|
fprintf(stderr, " \t * the number of bits per key required in the FCH algorithm\n");
|
|
|
|
fprintf(stderr, " \t * the load factor in the CHD_PH algorithm\n");
|
2004-12-23 15:16:30 +02:00
|
|
|
fprintf(stderr, " -a\t algorithm - valid values are\n");
|
2005-01-18 23:06:08 +02:00
|
|
|
for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_names[i]);
|
2004-12-23 15:16:30 +02:00
|
|
|
fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n");
|
2005-01-18 23:06:08 +02:00
|
|
|
for (i = 0; i < CMPH_HASH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_hash_names[i]);
|
2004-12-23 15:16:30 +02:00
|
|
|
fprintf(stderr, " -V\t print version number and exit\n");
|
|
|
|
fprintf(stderr, " -v\t increase verbosity (may be used multiple times)\n");
|
2005-01-05 21:48:23 +02:00
|
|
|
fprintf(stderr, " -k\t number of keys\n");
|
2004-12-23 15:16:30 +02:00
|
|
|
fprintf(stderr, " -g\t generation mode\n");
|
|
|
|
fprintf(stderr, " -s\t random seed\n");
|
|
|
|
fprintf(stderr, " -m\t minimum perfect hash function file \n");
|
2009-06-13 03:49:26 +03:00
|
|
|
fprintf(stderr, " -M\t main memory availability (in MB) used in BRZ algorithm \n");
|
|
|
|
fprintf(stderr, " -d\t temporary directory used in BRZ algorithm \n");
|
|
|
|
fprintf(stderr, " -b\t the meaning of this parameter depends on the algorithm selected in the -a option:\n");
|
|
|
|
fprintf(stderr, " \t * For BRZ it is used to make the maximal number of keys in a bucket lower than 256.\n");
|
|
|
|
fprintf(stderr, " \t In this case its value should be an integer in the range [64,175]. Default is 128.\n\n");
|
|
|
|
fprintf(stderr, " \t * For BDZ it is used to determine the size of some precomputed rank\n");
|
|
|
|
fprintf(stderr, " \t information and its value should be an integer in the range [3,10]. Default\n");
|
|
|
|
fprintf(stderr, " \t is 7. The larger is this value, the more compact are the resulting functions\n");
|
|
|
|
fprintf(stderr, " \t and the slower are them at evaluation time.\n\n");
|
|
|
|
fprintf(stderr, " \t * For CHD and CHD_PH it is used to set the average number of keys per bucket\n");
|
|
|
|
fprintf(stderr, " \t and its value should be an integer in the range [1,32]. Default is 4. The\n");
|
|
|
|
fprintf(stderr, " \t larger is this value, the slower is the construction of the functions.\n");
|
|
|
|
fprintf(stderr, " \t This parameter has no effect for other algorithms.\n\n");
|
|
|
|
fprintf(stderr, " -t\t set the number of keys per bin for a t-perfect hashing function. A t-perfect\n");
|
|
|
|
fprintf(stderr, " \t hash function allows at most t collisions in a given bin. This parameter applies\n");
|
|
|
|
fprintf(stderr, " \t only to the CHD and CHD_PH algorithms. Its value should be an integer in the\n");
|
|
|
|
fprintf(stderr, " \t range [1,128]. Defaul is 1\n");
|
2004-12-23 15:16:30 +02:00
|
|
|
fprintf(stderr, " keysfile\t line separated file with keys\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
int main(int argc, char **argv)
|
|
|
|
{
|
2008-04-12 09:17:21 +03:00
|
|
|
cmph_uint32 verbosity = 0;
|
2004-12-23 15:16:30 +02:00
|
|
|
char generate = 0;
|
|
|
|
char *mphf_file = NULL;
|
|
|
|
FILE *mphf_fd = stdout;
|
|
|
|
const char *keys_file = NULL;
|
|
|
|
FILE *keys_fd;
|
2005-01-18 23:06:08 +02:00
|
|
|
cmph_uint32 nkeys = UINT_MAX;
|
|
|
|
cmph_uint32 seed = UINT_MAX;
|
2004-12-23 15:16:30 +02:00
|
|
|
CMPH_HASH *hashes = NULL;
|
2005-01-18 23:06:08 +02:00
|
|
|
cmph_uint32 nhashes = 0;
|
|
|
|
cmph_uint32 i;
|
2005-01-25 23:06:58 +02:00
|
|
|
CMPH_ALGO mph_algo = CMPH_CHM;
|
2008-04-12 09:17:21 +03:00
|
|
|
double c = 0;
|
2005-01-21 22:42:33 +02:00
|
|
|
cmph_config_t *config = NULL;
|
|
|
|
cmph_t *mphf = NULL;
|
2006-04-27 20:30:19 +03:00
|
|
|
char * tmp_dir = NULL;
|
2005-01-24 22:25:58 +02:00
|
|
|
cmph_io_adapter_t *source;
|
2005-09-06 17:37:35 +03:00
|
|
|
cmph_uint32 memory_availability = 0;
|
2009-03-18 21:40:23 +02:00
|
|
|
cmph_uint32 b = 0;
|
2009-03-19 00:08:46 +02:00
|
|
|
cmph_uint32 keys_per_bin = 1;
|
2004-12-23 15:16:30 +02:00
|
|
|
while (1)
|
|
|
|
{
|
2009-06-12 08:46:18 +03:00
|
|
|
char ch = (char)getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:");
|
2005-01-17 19:58:43 +02:00
|
|
|
if (ch == -1) break;
|
|
|
|
switch (ch)
|
2004-12-23 15:16:30 +02:00
|
|
|
{
|
|
|
|
case 's':
|
|
|
|
{
|
|
|
|
char *cptr;
|
2009-06-12 08:46:18 +03:00
|
|
|
seed = (cmph_uint32)strtoul(optarg, &cptr, 10);
|
2004-12-23 15:16:30 +02:00
|
|
|
if(*cptr != 0) {
|
|
|
|
fprintf(stderr, "Invalid seed %s\n", optarg);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2005-01-17 19:58:43 +02:00
|
|
|
case 'c':
|
|
|
|
{
|
|
|
|
char *endptr;
|
|
|
|
c = strtod(optarg, &endptr);
|
|
|
|
if(*endptr != 0) {
|
|
|
|
fprintf(stderr, "Invalid c value %s\n", optarg);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2004-12-23 15:16:30 +02:00
|
|
|
case 'g':
|
|
|
|
generate = 1;
|
|
|
|
break;
|
2005-01-05 21:48:23 +02:00
|
|
|
case 'k':
|
|
|
|
{
|
|
|
|
char *endptr;
|
2009-06-12 08:46:18 +03:00
|
|
|
nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10);
|
2005-01-05 21:48:23 +02:00
|
|
|
if(*endptr != 0) {
|
|
|
|
fprintf(stderr, "Invalid number of keys %s\n", optarg);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2004-12-23 15:16:30 +02:00
|
|
|
case 'm':
|
|
|
|
mphf_file = strdup(optarg);
|
|
|
|
break;
|
2005-08-08 04:00:27 +03:00
|
|
|
case 'd':
|
|
|
|
tmp_dir = strdup(optarg);
|
|
|
|
break;
|
2005-09-06 17:37:35 +03:00
|
|
|
case 'M':
|
|
|
|
{
|
|
|
|
char *cptr;
|
2009-06-12 08:46:18 +03:00
|
|
|
memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10);
|
2005-09-06 17:37:35 +03:00
|
|
|
if(*cptr != 0) {
|
|
|
|
fprintf(stderr, "Invalid memory availability %s\n", optarg);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2006-01-25 21:45:14 +02:00
|
|
|
case 'b':
|
|
|
|
{
|
|
|
|
char *cptr;
|
2009-06-12 08:46:18 +03:00
|
|
|
b = (cmph_uint32)strtoul(optarg, &cptr, 10);
|
2006-01-25 21:45:14 +02:00
|
|
|
if(*cptr != 0) {
|
|
|
|
fprintf(stderr, "Parameter b was not found: %s\n", optarg);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2009-03-18 21:40:23 +02:00
|
|
|
case 't':
|
|
|
|
{
|
|
|
|
char *cptr;
|
2009-06-12 08:46:18 +03:00
|
|
|
keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10);
|
2009-03-18 21:40:23 +02:00
|
|
|
if(*cptr != 0) {
|
|
|
|
fprintf(stderr, "Parameter t was not found: %s\n", optarg);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
2004-12-23 15:16:30 +02:00
|
|
|
case 'v':
|
|
|
|
++verbosity;
|
|
|
|
break;
|
|
|
|
case 'V':
|
|
|
|
printf("%s\n", VERSION);
|
|
|
|
return 0;
|
|
|
|
case 'h':
|
|
|
|
usage_long(argv[0]);
|
|
|
|
return 0;
|
|
|
|
case 'a':
|
|
|
|
{
|
|
|
|
char valid = 0;
|
2005-01-18 23:06:08 +02:00
|
|
|
for (i = 0; i < CMPH_COUNT; ++i)
|
2004-12-23 15:16:30 +02:00
|
|
|
{
|
2005-01-18 23:06:08 +02:00
|
|
|
if (strcmp(cmph_names[i], optarg) == 0)
|
2004-12-23 15:16:30 +02:00
|
|
|
{
|
|
|
|
mph_algo = i;
|
|
|
|
valid = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2005-09-23 23:54:31 +03:00
|
|
|
if (!valid)
|
2004-12-23 15:16:30 +02:00
|
|
|
{
|
2005-09-16 05:53:07 +03:00
|
|
|
fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION);
|
2004-12-23 15:16:30 +02:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case 'f':
|
|
|
|
{
|
|
|
|
char valid = 0;
|
2005-01-18 23:06:08 +02:00
|
|
|
for (i = 0; i < CMPH_HASH_COUNT; ++i)
|
2004-12-23 15:16:30 +02:00
|
|
|
{
|
2005-01-18 23:06:08 +02:00
|
|
|
if (strcmp(cmph_hash_names[i], optarg) == 0)
|
2004-12-23 15:16:30 +02:00
|
|
|
{
|
|
|
|
hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 ));
|
|
|
|
hashes[nhashes] = i;
|
2005-01-18 23:06:08 +02:00
|
|
|
hashes[nhashes + 1] = CMPH_HASH_COUNT;
|
2004-12-23 15:16:30 +02:00
|
|
|
++nhashes;
|
|
|
|
valid = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!valid)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "Invalid hash function: %s\n", optarg);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
usage(argv[0]);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (optind != argc - 1)
|
|
|
|
{
|
|
|
|
usage(argv[0]);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
keys_file = argv[optind];
|
2006-01-25 21:45:14 +02:00
|
|
|
|
2005-01-18 23:06:08 +02:00
|
|
|
if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
|
2004-12-23 15:16:30 +02:00
|
|
|
srand(seed);
|
2005-09-23 23:57:42 +03:00
|
|
|
int ret = 0;
|
2004-12-23 15:16:30 +02:00
|
|
|
if (mphf_file == NULL)
|
|
|
|
{
|
|
|
|
mphf_file = (char *)malloc(strlen(keys_file) + 5);
|
|
|
|
memcpy(mphf_file, keys_file, strlen(keys_file));
|
2008-04-12 09:17:21 +03:00
|
|
|
memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5);
|
2004-12-23 15:16:30 +02:00
|
|
|
}
|
|
|
|
|
2005-01-18 14:18:51 +02:00
|
|
|
keys_fd = fopen(keys_file, "r");
|
2006-01-25 21:45:14 +02:00
|
|
|
|
2004-12-23 15:16:30 +02:00
|
|
|
if (keys_fd == NULL)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2005-01-18 23:06:08 +02:00
|
|
|
if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
|
2005-01-24 22:25:58 +02:00
|
|
|
if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
|
|
|
|
else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
|
2004-12-23 15:16:30 +02:00
|
|
|
if (generate)
|
|
|
|
{
|
|
|
|
//Create mphf
|
2006-01-25 21:45:14 +02:00
|
|
|
mphf_fd = fopen(mphf_file, "w");
|
2005-01-24 22:25:58 +02:00
|
|
|
config = cmph_config_new(source);
|
2005-01-21 22:42:33 +02:00
|
|
|
cmph_config_set_algo(config, mph_algo);
|
|
|
|
if (nhashes) cmph_config_set_hashfuncs(config, hashes);
|
|
|
|
cmph_config_set_verbosity(config, verbosity);
|
2006-04-27 20:30:19 +03:00
|
|
|
cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir);
|
2006-01-25 21:45:14 +02:00
|
|
|
cmph_config_set_mphf_fd(config, mphf_fd);
|
2005-09-06 17:37:35 +03:00
|
|
|
cmph_config_set_memory_availability(config, memory_availability);
|
2006-01-25 21:45:14 +02:00
|
|
|
cmph_config_set_b(config, b);
|
2009-03-18 21:40:23 +02:00
|
|
|
cmph_config_set_keys_per_bin(config, keys_per_bin);
|
|
|
|
|
2006-08-07 17:44:24 +03:00
|
|
|
//if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
|
|
|
|
if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15;
|
2005-01-21 22:42:33 +02:00
|
|
|
if (c != 0) cmph_config_set_graphsize(config, c);
|
|
|
|
mphf = cmph_new(config);
|
2008-03-29 03:48:15 +02:00
|
|
|
|
2006-01-25 21:45:14 +02:00
|
|
|
cmph_config_destroy(config);
|
2004-12-23 15:16:30 +02:00
|
|
|
if (mphf == NULL)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "Unable to create minimum perfect hashing function\n");
|
2006-01-25 21:45:14 +02:00
|
|
|
//cmph_config_destroy(config);
|
2004-12-23 15:16:30 +02:00
|
|
|
free(mphf_file);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (mphf_fd == NULL)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno));
|
|
|
|
free(mphf_file);
|
|
|
|
return -1;
|
|
|
|
}
|
2006-01-25 21:45:14 +02:00
|
|
|
cmph_dump(mphf, mphf_fd);
|
|
|
|
cmph_destroy(mphf);
|
2004-12-23 15:16:30 +02:00
|
|
|
fclose(mphf_fd);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2005-01-27 15:01:45 +02:00
|
|
|
cmph_uint8 * hashtable = NULL;
|
2004-12-23 15:16:30 +02:00
|
|
|
mphf_fd = fopen(mphf_file, "r");
|
|
|
|
if (mphf_fd == NULL)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
|
|
|
|
free(mphf_file);
|
|
|
|
return -1;
|
|
|
|
}
|
2005-01-21 22:42:33 +02:00
|
|
|
mphf = cmph_load(mphf_fd);
|
2004-12-23 15:16:30 +02:00
|
|
|
fclose(mphf_fd);
|
|
|
|
if (!mphf)
|
|
|
|
{
|
|
|
|
fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
|
|
|
|
free(mphf_file);
|
|
|
|
return -1;
|
|
|
|
}
|
2005-09-23 23:57:42 +03:00
|
|
|
cmph_uint32 siz = cmph_size(mphf);
|
2009-03-19 00:08:46 +02:00
|
|
|
hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8));
|
2008-04-12 09:17:21 +03:00
|
|
|
memset(hashtable, 0,(size_t) siz);
|
2004-12-23 15:16:30 +02:00
|
|
|
//check all keys
|
2005-01-24 22:25:58 +02:00
|
|
|
for (i = 0; i < source->nkeys; ++i)
|
2004-12-23 15:16:30 +02:00
|
|
|
{
|
2005-01-18 23:06:08 +02:00
|
|
|
cmph_uint32 h;
|
2004-12-23 15:16:30 +02:00
|
|
|
char *buf;
|
2005-01-18 23:06:08 +02:00
|
|
|
cmph_uint32 buflen = 0;
|
2005-01-24 22:25:58 +02:00
|
|
|
source->read(source->data, &buf, &buflen);
|
2005-01-21 22:42:33 +02:00
|
|
|
h = cmph_search(mphf, buf, buflen);
|
2005-09-23 23:57:42 +03:00
|
|
|
if (!(h < siz))
|
|
|
|
{
|
|
|
|
fprintf(stderr, "Unknown key %*s in the input.\n", buflen, buf);
|
|
|
|
ret = 1;
|
2009-03-19 00:08:46 +02:00
|
|
|
} else if(hashtable[h] >= keys_per_bin)
|
2005-09-23 23:57:42 +03:00
|
|
|
{
|
2009-03-19 00:08:46 +02:00
|
|
|
fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h);
|
2005-09-23 23:57:42 +03:00
|
|
|
fprintf(stderr, "Duplicated or unknown key %*s in the input\n", buflen, buf);
|
|
|
|
ret = 1;
|
2009-03-19 00:08:46 +02:00
|
|
|
} else hashtable[h]++;
|
2005-09-23 23:57:42 +03:00
|
|
|
|
2004-12-23 15:16:30 +02:00
|
|
|
if (verbosity)
|
|
|
|
{
|
|
|
|
printf("%s -> %u\n", buf, h);
|
|
|
|
}
|
2005-01-24 22:25:58 +02:00
|
|
|
source->dispose(source->data, buf, buflen);
|
2004-12-23 15:16:30 +02:00
|
|
|
}
|
2008-03-26 22:26:48 +02:00
|
|
|
|
2005-01-21 22:42:33 +02:00
|
|
|
cmph_destroy(mphf);
|
2004-12-23 15:16:30 +02:00
|
|
|
free(hashtable);
|
|
|
|
}
|
|
|
|
fclose(keys_fd);
|
|
|
|
free(mphf_file);
|
2005-08-08 04:00:27 +03:00
|
|
|
free(tmp_dir);
|
2006-01-25 21:45:14 +02:00
|
|
|
cmph_io_nlfile_adapter_destroy(source);
|
2005-09-23 23:57:42 +03:00
|
|
|
return ret;
|
2006-01-25 21:45:14 +02:00
|
|
|
|
2004-12-23 15:16:30 +02:00
|
|
|
}
|