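/*
 * turbonss/src/main.c
 *
 * Command-line front end for the cmph minimum perfect hashing library.
 * With -g it generates a minimum perfect hash function from a keys file
 * and dumps it to an .mph file; without -g it loads an .mph file and
 * checks every key of the keys file against it.
 */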
#include "../wingetopt.h"
/* Standard C library headers. */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <time.h>
#include <limits.h>
#include <assert.h>
#include "cmph.h"
#include "hash.h"
#ifdef WIN32
#define VERSION "0.2"
#else
#include "config.h"
#endif
/*
 * Print the one-line usage synopsis to stderr.
 *
 * prg: program name shown in the synopsis (main passes argv[0]).
 *
 * The synopsis lists every option accepted by the getopt string in main,
 * including -f (hash function), which used to be missing.
 */
void usage(const char *prg)
{
	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-m file.mph] [-a algorithm] keysfile\n", prg);
}
void usage_long(const char *prg)
{
2005-01-18 23:06:08 +02:00
cmph_uint32 i;
fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k] [-g [-s seed] ] [-m file.mph] [-a algorithm] keysfile\n", prg);
2004-12-23 15:16:30 +02:00
fprintf(stderr, "Minimum perfect hashing tool\n\n");
fprintf(stderr, " -h\t print this help message\n");
2005-01-17 19:58:43 +02:00
fprintf(stderr, " -c\t c value that determines the number of vertices in the graph\n");
2004-12-23 15:16:30 +02:00
fprintf(stderr, " -a\t algorithm - valid values are\n");
2005-01-18 23:06:08 +02:00
for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_names[i]);
2004-12-23 15:16:30 +02:00
fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n");
2005-01-18 23:06:08 +02:00
for (i = 0; i < CMPH_HASH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_hash_names[i]);
2004-12-23 15:16:30 +02:00
fprintf(stderr, " -V\t print version number and exit\n");
fprintf(stderr, " -v\t increase verbosity (may be used multiple times)\n");
fprintf(stderr, " -k\t number of keys\n");
2004-12-23 15:16:30 +02:00
fprintf(stderr, " -g\t generation mode\n");
fprintf(stderr, " -s\t random seed\n");
fprintf(stderr, " -m\t minimum perfect hash function file \n");
fprintf(stderr, " keysfile\t line separated file with keys\n");
}
2005-01-18 23:06:08 +02:00
static int key_read(void *data, char **key, cmph_uint32 *keylen)
2004-12-23 15:16:30 +02:00
{
FILE *fd = (FILE *)data;
*key = NULL;
*keylen = 0;
while(1)
{
char buf[BUFSIZ];
char *c = fgets(buf, BUFSIZ, fd);
if (c == NULL) return -1;
if (feof(fd)) return -1;
*key = (char *)realloc(*key, *keylen + strlen(buf) + 1);
memcpy(*key + *keylen, buf, strlen(buf));
2005-01-18 23:06:08 +02:00
*keylen += (cmph_uint32)strlen(buf);
2004-12-23 15:16:30 +02:00
if (buf[strlen(buf) - 1] != '\n') continue;
break;
}
if ((*keylen) && (*key)[*keylen - 1] == '\n')
{
(*key)[(*keylen) - 1] = 0;
--(*keylen);
}
return *keylen;
}
2005-01-18 23:06:08 +02:00
static void key_dispose(void *data, char *key, cmph_uint32 keylen)
2004-12-23 15:16:30 +02:00
{
free(key);
}
/*
 * Key-source "rewind" callback: reposition the keys stream at its start.
 *
 * data: the FILE * of the keys file, passed as an opaque pointer.
 */
static void key_rewind(void *data)
{
	rewind((FILE *)data);
}
2005-01-18 23:06:08 +02:00
static cmph_uint32 count_keys(FILE *fd)
2004-12-23 15:16:30 +02:00
{
2005-01-18 23:06:08 +02:00
cmph_uint32 count = 0;
2004-12-23 15:16:30 +02:00
rewind(fd);
while(1)
{
char buf[BUFSIZ];
fgets(buf, BUFSIZ, fd);
2004-12-23 15:16:30 +02:00
if (feof(fd)) break;
if (buf[strlen(buf) - 1] != '\n') continue;
++count;
}
rewind(fd);
return count;
}
int main(int argc, char **argv)
{
char verbosity = 0;
char generate = 0;
char *mphf_file = NULL;
FILE *mphf_fd = stdout;
const char *keys_file = NULL;
FILE *keys_fd;
2005-01-18 23:06:08 +02:00
cmph_uint32 nkeys = UINT_MAX;
cmph_uint32 seed = UINT_MAX;
2004-12-23 15:16:30 +02:00
CMPH_HASH *hashes = NULL;
2005-01-18 23:06:08 +02:00
cmph_uint32 nhashes = 0;
cmph_uint32 i;
CMPH_ALGO mph_algo = CMPH_CZECH;
2005-01-17 19:58:43 +02:00
float c = 2.09;
2005-01-18 23:06:08 +02:00
cmph_mph_t *mph = NULL;
cmph_mphf_t *mphf = NULL;
2004-12-23 15:16:30 +02:00
2005-01-18 23:06:08 +02:00
cmph_key_source_t source;
2004-12-23 15:16:30 +02:00
while (1)
{
2005-01-17 19:58:43 +02:00
char ch = getopt(argc, argv, "hVvgc:k:a:f:m:s:");
if (ch == -1) break;
switch (ch)
2004-12-23 15:16:30 +02:00
{
case 's':
{
char *cptr;
seed = strtoul(optarg, &cptr, 10);
if(*cptr != 0) {
fprintf(stderr, "Invalid seed %s\n", optarg);
exit(1);
}
}
break;
2005-01-17 19:58:43 +02:00
case 'c':
{
char *endptr;
c = strtod(optarg, &endptr);
if(*endptr != 0) {
fprintf(stderr, "Invalid c value %s\n", optarg);
exit(1);
}
}
break;
2004-12-23 15:16:30 +02:00
case 'g':
generate = 1;
break;
case 'k':
{
char *endptr;
nkeys = strtoul(optarg, &endptr, 10);
if(*endptr != 0) {
fprintf(stderr, "Invalid number of keys %s\n", optarg);
exit(1);
}
}
break;
2004-12-23 15:16:30 +02:00
case 'm':
mphf_file = strdup(optarg);
break;
case 'v':
++verbosity;
break;
case 'V':
printf("%s\n", VERSION);
return 0;
case 'h':
usage_long(argv[0]);
return 0;
case 'a':
{
char valid = 0;
2005-01-18 23:06:08 +02:00
for (i = 0; i < CMPH_COUNT; ++i)
2004-12-23 15:16:30 +02:00
{
2005-01-18 23:06:08 +02:00
if (strcmp(cmph_names[i], optarg) == 0)
2004-12-23 15:16:30 +02:00
{
mph_algo = i;
valid = 1;
break;
}
}
if (!valid)
{
fprintf(stderr, "Invalid mph algorithm: %s\n", optarg);
return -1;
}
}
break;
case 'f':
{
char valid = 0;
2005-01-18 23:06:08 +02:00
for (i = 0; i < CMPH_HASH_COUNT; ++i)
2004-12-23 15:16:30 +02:00
{
2005-01-18 23:06:08 +02:00
if (strcmp(cmph_hash_names[i], optarg) == 0)
2004-12-23 15:16:30 +02:00
{
hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 ));
hashes[nhashes] = i;
2005-01-18 23:06:08 +02:00
hashes[nhashes + 1] = CMPH_HASH_COUNT;
2004-12-23 15:16:30 +02:00
++nhashes;
valid = 1;
break;
}
}
if (!valid)
{
fprintf(stderr, "Invalid hash function: %s\n", optarg);
return -1;
}
}
break;
default:
usage(argv[0]);
return 1;
}
}
if (optind != argc - 1)
{
usage(argv[0]);
return 1;
}
keys_file = argv[optind];
2005-01-18 23:06:08 +02:00
if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
2004-12-23 15:16:30 +02:00
srand(seed);
if (mphf_file == NULL)
{
mphf_file = (char *)malloc(strlen(keys_file) + 5);
memcpy(mphf_file, keys_file, strlen(keys_file));
memcpy(mphf_file + strlen(keys_file), ".mph\0", 5);
}
keys_fd = fopen(keys_file, "r");
2004-12-23 15:16:30 +02:00
if (keys_fd == NULL)
{
fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
return -1;
}
source.data = (void *)keys_fd;
2005-01-18 23:06:08 +02:00
if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL);
if(nkeys == UINT_MAX) source.nkeys = count_keys(keys_fd);
else source.nkeys = nkeys;
2004-12-23 15:16:30 +02:00
source.read = key_read;
source.dispose = key_dispose;
source.rewind = key_rewind;
if (generate)
{
//Create mphf
2005-01-18 23:06:08 +02:00
mph = cmph_mph_new(mph_algo, &source);
if (nhashes) cmph_mph_set_hashfuncs(mph, hashes);
cmph_mph_set_verbosity(mph, verbosity);
if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15;
if (c != 0) cmph_mph_set_graphsize(mph, c);
mphf = cmph_mph_create(mph);
2004-12-23 15:16:30 +02:00
if (mphf == NULL)
{
fprintf(stderr, "Unable to create minimum perfect hashing function\n");
2005-01-18 23:06:08 +02:00
cmph_mph_destroy(mph);
2004-12-23 15:16:30 +02:00
free(mphf_file);
return -1;
}
mphf_fd = fopen(mphf_file, "w");
if (mphf_fd == NULL)
{
fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno));
free(mphf_file);
return -1;
}
2005-01-18 23:06:08 +02:00
cmph_mphf_dump(mphf, mphf_fd);
cmph_mphf_destroy(mphf);
2004-12-23 15:16:30 +02:00
fclose(mphf_fd);
}
else
{
2005-01-18 23:06:08 +02:00
cmph_uint8 * hashtable = NULL;
2004-12-23 15:16:30 +02:00
mphf_fd = fopen(mphf_file, "r");
if (mphf_fd == NULL)
{
fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
free(mphf_file);
return -1;
}
2005-01-18 23:06:08 +02:00
mphf = cmph_mphf_load(mphf_fd);
2004-12-23 15:16:30 +02:00
fclose(mphf_fd);
if (!mphf)
{
fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
free(mphf_file);
return -1;
}
2005-01-18 23:06:08 +02:00
hashtable = (cmph_uint8*)malloc(source.nkeys*sizeof(cmph_uint8));
2004-12-23 15:16:30 +02:00
memset(hashtable, 0, source.nkeys);
//check all keys
for (i = 0; i < source.nkeys; ++i)
{
2005-01-18 23:06:08 +02:00
cmph_uint32 h;
2004-12-23 15:16:30 +02:00
char *buf;
2005-01-18 23:06:08 +02:00
cmph_uint32 buflen = 0;
2004-12-23 15:16:30 +02:00
source.read(source.data, &buf, &buflen);
2005-01-18 23:06:08 +02:00
h = cmph_mphf_search(mphf, buf, buflen);
2004-12-23 15:16:30 +02:00
if(hashtable[h])fprintf(stderr, "collision: %u\n",h);
assert(hashtable[h]==0);
hashtable[h] = 1;
if (verbosity)
{
printf("%s -> %u\n", buf, h);
}
source.dispose(source.data, buf, buflen);
}
2005-01-18 23:06:08 +02:00
cmph_mphf_destroy(mphf);
2004-12-23 15:16:30 +02:00
free(hashtable);
}
fclose(keys_fd);
free(mphf_file);
return 0;
}