FCH algorithm documentation was added
This commit is contained in:
parent
beeea04351
commit
60c686a2fc
4
CHM.t2t
4
CHM.t2t
|
@ -6,7 +6,7 @@ CHM Algorithm
|
||||||
----------------------------------------
|
----------------------------------------
|
||||||
|
|
||||||
==The Algorithm==
|
==The Algorithm==
|
||||||
|
The algorithm is presented in [[1,2,3 #papers]].
|
||||||
----------------------------------------
|
----------------------------------------
|
||||||
|
|
||||||
==Memory Consumption==
|
==Memory Consumption==
|
||||||
|
@ -70,7 +70,7 @@ Again we have:
|
||||||
|
|
||||||
----------------------------------------
|
----------------------------------------
|
||||||
|
|
||||||
==Papers==
|
==Papers==[papers]
|
||||||
|
|
||||||
+ Z.J. Czech, G. Havas, and B.S. Majewski. [An optimal algorithm for generating minimal perfect hash functions. papers/chm92.pdf], Information Processing Letters, 43(5):257-264, 1992.
|
+ Z.J. Czech, G. Havas, and B.S. Majewski. [An optimal algorithm for generating minimal perfect hash functions. papers/chm92.pdf], Information Processing Letters, 43(5):257-264, 1992.
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,45 @@
|
||||||
|
FCH Algorithm
|
||||||
|
|
||||||
|
|
||||||
|
%!includeconf: CONFIG.t2t
|
||||||
|
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
==The Algorithm==
|
||||||
|
The algorithm is presented in [[1 #papers]].
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
==Memory Consumption==
|
||||||
|
|
||||||
|
Now we detail the memory consumption to generate and to store minimal perfect hash functions
|
||||||
|
using the FCH algorithm. The structures responsible for memory consumption are the
|
||||||
|
following:
|
||||||
|
- A vector containing all the //n// keys.
|
||||||
|
- Data structure to speed up the searching step:
|
||||||
|
+ **random_table**: is a vector used to remember currently empty slots in the hash table. It stores //n// 4-byte integers. This vector initially contains a random permutation of the //n// hash addresses. A pointer called filled_count is used to maintain the invariant that all slots at or to the right of filled_count are empty and all slots to its left are filled.
|
||||||
|
+ **hash_table**: Table used to check whether all the collisions were resolved. It has //n// entries of one byte.
|
||||||
|
+ **map_table**: is a vector of //n// 4-byte pointers into random_table such that, for any unfilled slot //x// in hash_table, random_table[map_table[x]] = x. Thus, given an empty slot //x// in the hash_table, we can locate its position in the random_table vector through map_table.
|
||||||
|
|
||||||
|
- Other auxiliary structures
|
||||||
|
+ **sorted_indexes**: is a vector of //cn/(log(n) + 1)// 4 byte long pointers to indirectly keep the buckets sorted by decreasing order of their sizes.
|
||||||
|
|
||||||
|
+ **function //g//**: is represented by a vector of //cn/(log(n) + 1)// 4 byte long integer numbers, one for each bucket. It is used to spread all the keys in a given bucket into the hash table without collisions.
|
||||||
|
|
||||||
|
|
||||||
|
Thus, the total memory consumption of the FCH algorithm for generating a minimal
|
||||||
|
perfect hash function (MPHF) is: //O(n) + 9n + 8cn/(log(n) + 1)// bytes.
|
||||||
|
The value of parameter //c// must be greater than or equal to 2.6.
|
||||||
|
|
||||||
|
Now we present the memory consumption to store the resulting function.
|
||||||
|
We only need to store the //g// function and a constant number of bytes for the seed of the hash functions used in the resulting MPHF. Since //g// is a vector of //cn/(log(n) + 1)// 4-byte integers, we need //4cn/(log(n) + 1) + O(1)// bytes.
|
||||||
|
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
==Papers==[papers]
|
||||||
|
|
||||||
|
+ E.A. Fox, Q.F. Chen, and L.S. Heath. [A faster algorithm for constructing minimal perfect hash functions. papers/fch92.pdf] In Proc. 15th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pages 266-273, 1992.
|
||||||
|
|
||||||
|
|
||||||
|
%!include: ALGORITHMS.t2t
|
||||||
|
|
||||||
|
%!include: FOOTER.t2t
|
12
README.t2t
12
README.t2t
|
@ -176,16 +176,21 @@ utility.
|
||||||
|
|
||||||
|
|
||||||
```
|
```
|
||||||
usage: cmph [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile
|
usage: cmph [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ]
|
||||||
Minimum perfect hashing tool
|
[-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir]
|
||||||
|
[-m file.mph] keysfile
|
||||||
|
Minimum perfect hashing tool
|
||||||
|
|
||||||
-h print this help message
|
-h print this help message
|
||||||
-c c value that determines the number of vertices in the graph
|
-c c value determines:
|
||||||
|
the number of vertices in the graph for the algorithms BMZ and CHM
|
||||||
|
the number of bits per key required in the FCH algorithm
|
||||||
-a algorithm - valid values are
|
-a algorithm - valid values are
|
||||||
* bmz
|
* bmz
|
||||||
* bmz8
|
* bmz8
|
||||||
* chm
|
* chm
|
||||||
* brz
|
* brz
|
||||||
|
* fch
|
||||||
-f hash function (may be used multiple times) - valid values are
|
-f hash function (may be used multiple times) - valid values are
|
||||||
* djb2
|
* djb2
|
||||||
* fnv
|
* fnv
|
||||||
|
@ -201,7 +206,6 @@ utility.
|
||||||
-d temporary directory used in brz algorithm
|
-d temporary directory used in brz algorithm
|
||||||
-b parameter of BRZ algorithm to make the maximal number of keys in a bucket lower than 256
|
-b parameter of BRZ algorithm to make the maximal number of keys in a bucket lower than 256
|
||||||
keysfile line separated file with keys
|
keysfile line separated file with keys
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
==Additional Documentation==
|
==Additional Documentation==
|
||||||
|
|
2
gendocs
2
gendocs
|
@ -2,6 +2,7 @@ txt2tags -t html --mask-email -i README.t2t -o index.html
|
||||||
txt2tags -t html -i BMZ.t2t -o bmz.html
|
txt2tags -t html -i BMZ.t2t -o bmz.html
|
||||||
txt2tags -t html -i BRZ.t2t -o brz.html
|
txt2tags -t html -i BRZ.t2t -o brz.html
|
||||||
txt2tags -t html -i CHM.t2t -o chm.html
|
txt2tags -t html -i CHM.t2t -o chm.html
|
||||||
|
txt2tags -t html -i FCH.t2t -o fch.html
|
||||||
txt2tags -t html -i COMPARISON.t2t -o comparison.html
|
txt2tags -t html -i COMPARISON.t2t -o comparison.html
|
||||||
txt2tags -t html -i GPERF.t2t -o gperf.html
|
txt2tags -t html -i GPERF.t2t -o gperf.html
|
||||||
txt2tags -t html -i FAQ.t2t -o faq.html
|
txt2tags -t html -i FAQ.t2t -o faq.html
|
||||||
|
@ -12,6 +13,7 @@ txt2tags -t txt --mask-email -i README.t2t -o README
|
||||||
txt2tags -t txt -i BMZ.t2t -o BMZ
|
txt2tags -t txt -i BMZ.t2t -o BMZ
|
||||||
txt2tags -t txt -i BRZ.t2t -o BRZ
|
txt2tags -t txt -i BRZ.t2t -o BRZ
|
||||||
txt2tags -t txt -i CHM.t2t -o CHM
|
txt2tags -t txt -i CHM.t2t -o CHM
|
||||||
|
txt2tags -t txt -i FCH.t2t -o FCH
|
||||||
txt2tags -t txt -i COMPARISON.t2t -o COMPARISON
|
txt2tags -t txt -i COMPARISON.t2t -o COMPARISON
|
||||||
txt2tags -t txt -i GPERF.t2t -o GPERF
|
txt2tags -t txt -i GPERF.t2t -o GPERF
|
||||||
txt2tags -t txt -i FAQ.t2t -o FAQ
|
txt2tags -t txt -i FAQ.t2t -o FAQ
|
||||||
|
|
|
@ -30,7 +30,9 @@ void usage_long(const char *prg)
|
||||||
fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile\n", prg);
|
fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile\n", prg);
|
||||||
fprintf(stderr, "Minimum perfect hashing tool\n\n");
|
fprintf(stderr, "Minimum perfect hashing tool\n\n");
|
||||||
fprintf(stderr, " -h\t print this help message\n");
|
fprintf(stderr, " -h\t print this help message\n");
|
||||||
fprintf(stderr, " -c\t c value that determines the number of vertices in the graph\n");
|
fprintf(stderr, " -c\t c value determines:\n");
|
||||||
|
fprintf(stderr, " \t the number of vertices in the graph for the algorithms BMZ and CHM\n");
|
||||||
|
fprintf(stderr, " \t the number of bits per key required in the FCH algorithm\n");
|
||||||
fprintf(stderr, " -a\t algorithm - valid values are\n");
|
fprintf(stderr, " -a\t algorithm - valid values are\n");
|
||||||
for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_names[i]);
|
for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_names[i]);
|
||||||
fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n");
|
fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n");
|
||||||
|
|
Loading…
Reference in New Issue