From 6a18486edd89799e48022272106d925d96b92c41 Mon Sep 17 00:00:00 2001
From: fc_botelho <fc_botelho>
Date: Wed, 14 Feb 2007 02:14:10 +0000
Subject: [PATCH] FCH algorithm documentation was added

---
 CHM.t2t    |  4 ++--
 FCH.t2t    | 45 +++++++++++++++++++++++++++++++++++++++++++++
 README.t2t | 12 ++++++++----
 gendocs    |  2 ++
 src/main.c |  4 +++-
 5 files changed, 60 insertions(+), 7 deletions(-)
 create mode 100644 FCH.t2t

diff --git a/CHM.t2t b/CHM.t2t
index 712723e..d696d38 100644
--- a/CHM.t2t
+++ b/CHM.t2t
@@ -6,7 +6,7 @@ CHM Algorithm
 ----------------------------------------
 
 ==The Algorithm==
-
+The algorithm is presented in [[1,2,3 #papers]].
 ----------------------------------------
 
 ==Memory Consumption==
@@ -70,7 +70,7 @@ Again we have:
 
 ----------------------------------------
   
-==Papers==
+==Papers==[papers]
 
 + Z.J. Czech, G. Havas, and B.S. Majewski. [An optimal algorithm for generating minimal perfect hash functions. papers/chm92.pdf], Information Processing Letters, 43(5):257-264, 1992.
 
diff --git a/FCH.t2t b/FCH.t2t
new file mode 100644
index 0000000..73acfa5
--- /dev/null
+++ b/FCH.t2t
@@ -0,0 +1,45 @@
+FCH Algorithm
+
+
+%!includeconf: CONFIG.t2t
+
+----------------------------------------
+
+==The Algorithm==
+The algorithm is presented in [[1 #papers]].
+----------------------------------------
+
+==Memory Consumption==
+
+Now we detail the memory consumption to generate and to store minimal perfect hash functions
+using the FCH algorithm. The structures responsible for memory consumption are the
+following:
+- A vector containing all the //n// keys.
+- Data structure to speed up the searching step:
+  + **random_table**: is a vector used to remember the currently empty slots in the hash table. It stores //n// 4 byte long integer numbers. This vector initially contains a random permutation of the //n// hash addresses. A pointer called filled_count is used to maintain the invariant that all slots at or to the right of filled_count are empty and all slots to its left are filled.
+  + **hash_table**: Table used to check whether all the collisions were resolved. It has //n// entries of one byte.
+  + **map_table**: is a vector of //n// 4 byte long pointers into random_table such that, for any unfilled slot //x// in hash_table, random_table[map_table[x]] = x. Thus, given an empty slot //x// in the hash_table, we can locate its position in the random_table vector through map_table.
+    
+- Other auxiliary structures    
+  + **sorted_indexes**: is a vector of //cn/(log(n) + 1)// 4 byte long pointers used to keep the buckets indirectly sorted in decreasing order of size.
+      
+  + **function //g//**: is represented by a vector of //cn/(log(n) + 1)// 4 byte long integer numbers, one for each bucket. It is used to spread all the keys in a given bucket into the hash table without collisions.
+
+    
+Thus, the total memory consumption of the FCH algorithm for generating a minimal
+perfect hash function (MPHF) is: //O(n) + 9n + 8cn/(log(n) + 1)// bytes.
+The value of parameter //c// must be greater than or equal to 2.6.
+  
+Now we present the memory consumption to store the resulting function.
+We only need to store the //g// function and a constant number of bytes for the seed of the hash functions used in the resulting MPHF. Since //g// has //cn/(log(n) + 1)// entries of 4 bytes each, we need //4cn/(log(n) + 1) + O(1)// bytes.
+
+----------------------------------------
+  
+==Papers==[papers]
+
++ E.A. Fox, Q.F. Chen, and L.S. Heath. [A faster algorithm for constructing minimal perfect hash functions. papers/fch92.pdf] In Proc. 15th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, pages 266-273, 1992.
+
+
+%!include: ALGORITHMS.t2t
+
+%!include: FOOTER.t2t
diff --git a/README.t2t b/README.t2t
index defd85d..b27bd4d 100644
--- a/README.t2t
+++ b/README.t2t
@@ -176,16 +176,21 @@ utility.
 
 
 ```
- usage: cmph [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile
- Minimum perfect hashing tool
+usage: cmph [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] 
+            [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] 
+            [-m file.mph] keysfile
+Minimum perfect hashing tool
 
   -h     print this help message
-  -c     c value that determines the number of vertices in the graph
+  -c     c value determines:
+           the number of vertices in the graph for the algorithms BMZ and CHM
+           the number of bits per key required in the FCH algorithm
   -a     algorithm - valid values are
           * bmz
           * bmz8
           * chm
           * brz
+          * fch
   -f     hash function (may be used multiple times) - valid values are
           * djb2
           * fnv
@@ -201,7 +206,6 @@ utility.
   -d     temporary directory used in brz algorithm
   -b     parmeter of BRZ algorithm to make the maximal number of keys in a bucket lower than 256
   keysfile       line separated file with keys
-
 ```
 
 ==Additional Documentation==
diff --git a/gendocs b/gendocs
index b732b3e..05be819 100755
--- a/gendocs
+++ b/gendocs
@@ -2,6 +2,7 @@ txt2tags -t html --mask-email -i README.t2t -o index.html
 txt2tags -t html -i BMZ.t2t -o bmz.html
 txt2tags -t html -i BRZ.t2t -o brz.html
 txt2tags -t html -i CHM.t2t -o chm.html
+txt2tags -t html -i FCH.t2t -o fch.html
 txt2tags -t html -i COMPARISON.t2t -o comparison.html
 txt2tags -t html -i GPERF.t2t -o gperf.html
 txt2tags -t html -i FAQ.t2t -o faq.html
@@ -12,6 +13,7 @@ txt2tags -t txt --mask-email -i README.t2t -o README
 txt2tags -t txt -i BMZ.t2t -o BMZ
 txt2tags -t txt -i BRZ.t2t -o BRZ
 txt2tags -t txt -i CHM.t2t -o CHM
+txt2tags -t txt -i FCH.t2t -o FCH
 txt2tags -t txt -i COMPARISON.t2t -o COMPARISON
 txt2tags -t txt -i GPERF.t2t -o GPERF
 txt2tags -t txt -i FAQ.t2t -o FAQ
diff --git a/src/main.c b/src/main.c
index ee3c954..fac05cd 100644
--- a/src/main.c
+++ b/src/main.c
@@ -30,7 +30,9 @@ void usage_long(const char *prg)
 	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile\n", prg);   
 	fprintf(stderr, "Minimum perfect hashing tool\n\n"); 
 	fprintf(stderr, "  -h\t print this help message\n");
-	fprintf(stderr, "  -c\t c value that determines the number of vertices in the graph\n");
+	fprintf(stderr, "  -c\t c value determines:\n");
+	fprintf(stderr, "    \t   the number of vertices in the graph for the algorithms BMZ and CHM\n");
+	fprintf(stderr, "    \t   the number of bits per key required in the FCH algorithm\n");
 	fprintf(stderr, "  -a\t algorithm - valid values are\n");
 	for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, "    \t  * %s\n", cmph_names[i]);
 	fprintf(stderr, "  -f\t hash function (may be used multiple times) - valid values are\n");