\section{Experimental Results}
We now present some experimental results.
The same experiments were run with our algorithm and
the algorithm due to Czech, Havas and Majewski~\cite{chm92}, referred to as
the CHM algorithm.
The two algorithms were implemented in the C language and
are available at \texttt{http://cmph.sf.net}.
Our data consists
of a collection of 100 million
uniform resource locators (URLs) collected from the Web.
The average length of a URL in the collection is 63 bytes.
All experiments were carried out on
a computer running the Linux operating system, version 2.6.7,
with a 2.4 gigahertz processor and
4 gigabytes of main memory.

Table~\ref{tab:characteristics} presents the main characteristics
of the two algorithms.
The number of edges in the graph $G=(V,E)$ is~$|S|=n$,
the number of keys in the input set~$S$.
The number of vertices of $G$ is equal to $1.15n$ and $2.09n$
for our algorithm and the CHM algorithm, respectively.
This measure determines the amount of space needed to store the array $g$.
Consequently, our algorithm reduces the space required to store the
resulting function to $55\%$ of the space required by the CHM algorithm.
The number of critical edges
is $\frac{1}{2}|E(G)|$ and 0 for our algorithm and the CHM algorithm,
respectively.
Our algorithm generates random graphs that contain cycles with high
probability, whereas the CHM algorithm generates acyclic random graphs.
Finally, the CHM algorithm generates order preserving functions,
while our algorithm does not preserve order.
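
To see where the $55\%$ figure comes from, note that both algorithms
store one word of $g$ per vertex of $G$, so the space ratio is simply
the ratio of the two values of $c$:
\[
\frac{1.15n}{2.09n} = \frac{1.15}{2.09} \approx 0.55.
\]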
\vspace{-10pt}
\begin{table}[htb]
{\footnotesize
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
& $c$ & $|E(G)|$ & $|V(G)|=|g|$ & $|E(G_\crit)|$ & $G$ & Order preserving \\
\hline
Our algorithm & 1.15 & $n$ & $cn$ & $0.5|E(G)|$ & cyclic & no \\
\hline
CHM algorithm & 2.09 & $n$ & $cn$ & 0 & acyclic & yes \\
\hline
\end{tabular}
\end{center}
}
\caption{Main characteristics of the algorithms}
\vspace{-25pt}
\label{tab:characteristics}
\end{table}
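
To make the role of the array $g$ concrete, the following C fragment
sketches how a function of this family is evaluated once $g$ has been
computed. It is a minimal sketch, not the code available at
\texttt{http://cmph.sf.net}: \texttt{hash1} and \texttt{hash2} are
illustrative names for the two auxiliary functions of the mapping
step, which map a key to a vertex of $G$.
\begin{verbatim}
/* Minimal sketch of the evaluation of h.  The identifiers
 * hash1, hash2, g, cn and n are illustrative: the auxiliary
 * functions map a key to one of the cn vertices of G, and
 * g holds one word per vertex. */
unsigned int hash(const char *key, unsigned int len)
{
    unsigned int u = hash1(key, len) % cn;  /* first endpoint  */
    unsigned int v = hash2(key, len) % cn;  /* second endpoint */
    return (g[u] + g[v]) % n;               /* value in [0,n)  */
}
\end{verbatim}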

Table~\ref{tab:timeresults} presents time measurements.
All times are in seconds.
The table entries are averages over 50 trials.
The column labelled $N_i$ gives
the number of iterations to generate the random graph $G$
in the mapping step of the algorithms.
The next columns give the running times
for the mapping plus ordering steps together and for the searching
step of each algorithm.
The last column gives the percentage gain of our algorithm
over the CHM algorithm.
\begin{table*}
{\footnotesize
\begin{center}
\begin{tabular}{|c|cccc|cccc|c|}
\hline
\raisebox{-0.7em}{$n$} & \multicolumn{4}{c|}{\raisebox{-1mm}{Our algorithm}} &
\multicolumn{4}{c|}{\raisebox{-1mm}{CHM algorithm}}& \raisebox{-0.2em}{Gain}\\
\cline{2-5} \cline{6-9}
& \raisebox{-1mm}{$N_i$} &\raisebox{-1mm}{Map+Ord} &
\raisebox{-1mm}{Search} &\raisebox{-1mm}{Total} &
\raisebox{-1mm}{$N_i$} &\raisebox{-1mm}{Map+Ord} &\raisebox{-1mm}{Search} &
\raisebox{-1mm}{Total} & \raisebox{0.2em}{(\%)}\\
\hline
6,250,000 & 2.20 & 33.09 & 10.48 & 43.57 & 2.90 & 62.26 & 6.76 & 69.02 & 58 \\
12,500,000 & 2.00 & 63.26 & 23.04 & 86.30 & 2.60 & 117.99 & 14.94 & 132.92 & 54 \\
25,000,000 & 2.00 & 130.79 & 51.55 & 182.34 & 2.80 & 262.05 & 33.68 & 295.73 & 62 \\
100,000,000 & 2.07 & 567.47 & 243.13 & 810.60 & 2.80 & 1,131.06 & 157.23 & 1,288.29 & 59 \\
\hline
\end{tabular}
\end{center}
\caption{Time measurements for our algorithm and the CHM algorithm}
\vspace{-25pt}
\label{tab:timeresults}
}\end{table*}
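
For reference, the loop whose iterations are counted in the column
$N_i$ can be outlined as in the following C sketch. The helper names
are hypothetical: \texttt{pick\_random\_functions} chooses new
auxiliary functions, \texttt{build\_graph} adds one edge per key of
$S$, and \texttt{graph\_is\_usable} stands for the acyclicity test of
the CHM algorithm and for the criteria of our mapping step.
\begin{verbatim}
/* Illustrative outline of the mapping step counted by N_i.
 * All helper names are hypothetical. */
int Ni = 0;
do {
    pick_random_functions();   /* new h1 and h2                   */
    build_graph();             /* one edge {h1(x), h2(x)} per key */
    Ni++;
} while (!graph_is_usable());  /* acyclicity for CHM; the mapping
                                  step criteria for our algorithm */
\end{verbatim}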

\enlargethispage{\baselineskip}
The mapping step of the new algorithm is faster because
the expected number of iterations to generate
$G$ is 2.13 for our algorithm and 2.92 for the CHM algorithm.
Moreover, the graph $G$ generated by our algorithm
has $1.15n$ vertices, against $2.09n$ for the CHM algorithm.
These two facts make our algorithm faster in the mapping step.
The time for the ordering step of our algorithm is approximately equal to
the time the CHM algorithm takes to check that $G$ is acyclic.
The searching step of the CHM algorithm is faster, but in total time
our algorithm is, on average, approximately 58\% faster
than the CHM algorithm.

The experimental results fully back the theoretical results.
It is important to notice the times for the searching step:
for both algorithms they are not the dominant times,
and the experimental results clearly show
a linear behavior for the searching step.

We now present a heuristic that reduces the space requirement
to any given value between $1.15n$ words and $0.93n$ words.
The heuristic reuses, when possible, the set
of $x$ values that caused reassignments, just before trying $x+1$
(see Section~\ref{sec:searching}); a sketch of this reuse loop
is given after Table~\ref{tab:timeresults2}.
The lower limit $c=0.93$ was obtained experimentally.
We generated $10{,}000$ random graphs for
each size $n$ ($n=10^5$, $5 \times 10^5$, $10^6$, $2\times 10^6$).
With $c=0.93$ we were always able to generate~$h$, but with $c=0.92$ we never
succeeded.
Decreasing the value of $c$ leads to an increase in the number of
iterations to generate $G$.
For example, for $c=1$ and $c=0.93$, the analytical expected numbers
of iterations are $2.72$ and $3.17$, respectively
(for $n=12{,}500{,}000$, the measured numbers of iterations were 2.78 for $c=1$
and 3.04 for $c=0.93$).
Table~\ref{tab:timeresults2} presents the total times to construct a
function for $n=12{,}500{,}000$: the total time increases from $86.30$ seconds
for $c=1.15$ (see Table~\ref{tab:timeresults}) to
$101.74$ seconds for $c=1$ and to $102.19$ seconds for $c=0.93$.

\vspace{-5pt}
\begin{table*}
{\footnotesize
\begin{center}
\begin{tabular}{|c|cccc|cccc|}
\hline
\raisebox{-0.7em}{$n$} & \multicolumn{4}{c|}{\raisebox{-1mm}{Our algorithm $c=1.00$}} &
\multicolumn{4}{c|}{\raisebox{-1mm}{Our algorithm $c=0.93$}} \\
\cline{2-5} \cline{6-9}
& \raisebox{-1mm}{$N_i$} &\raisebox{-1mm}{Map+Ord} &
\raisebox{-1mm}{Search} &\raisebox{-1mm}{Total} &
\raisebox{-1mm}{$N_i$} &\raisebox{-1mm}{Map+Ord} &\raisebox{-1mm}{Search} &
\raisebox{-1mm}{Total} \\
\hline
12,500,000 & 2.78 & 76.68 & 25.06 & 101.74 & 3.04 & 76.39 & 25.80 & 102.19 \\
\hline
\end{tabular}
\end{center}
\caption{Time measurements for our tuned algorithm with $c=1.00$ and $c=0.93$}
\vspace{-25pt}
\label{tab:timeresults2}
}
\end{table*}
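
The reuse loop of the heuristic can be outlined as in the C sketch
below. This is only an illustration of the description above, under
the assumptions of Section~\ref{sec:searching}; all helper names are
hypothetical, and the actual bookkeeping is more involved.
\begin{verbatim}
/* Illustrative outline of the reuse heuristic for a critical
 * vertex v: before trying x+1, the x values that previously
 * caused reassignments are tried again.  All names are
 * hypothetical. */
for (x = 0; ; x++) {
    if (try_reused_values(v)) break;  /* retry values that caused
                                         reassignments earlier   */
    if (try_value(v, x))      break;  /* assign g[v] = x         */
    record_failed_value(x);           /* keep x for later reuse  */
}
\end{verbatim}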

We compared our algorithm with the ones proposed by Pagh~\cite{p99} and by
Dietzfelbinger and Hagerup~\cite{dh01}. The authors sent us their
source code. In their implementations the set of keys is a set of random integers.
We modified our implementation to generate our~$h$ from a set of random
integers in order to make a fair comparison. For a set of $10^6$ random integers,
the times to generate a minimal perfect hash function were 2.7, 4.0 and 4.5 seconds for
our algorithm, Pagh's algorithm and Dietzfelbinger and Hagerup's algorithm, respectively.
Thus, our algorithm was 48\% faster than Pagh's algorithm and 67\% faster than
Dietzfelbinger and Hagerup's algorithm, on average. This gain was maintained for sets of
different sizes.
Our algorithm needs $kn$ ($k \in [0.93, 1.15]$) words to store
the resulting function, while Pagh's algorithm needs $kn$ ($k > 2$) words and
Dietzfelbinger and Hagerup's algorithm needs $kn$ ($k \in [1.13, 1.15]$) words.
The time to generate the functions is inversely proportional to the value of $k$.
\enlargethispage{\baselineskip}
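
For completeness, the integer key sets used in this comparison can be
produced as in the sketch below. This fragment is illustrative, not
the code we received or distributed; it draws pseudo-random integers,
and duplicates, which are rare at this scale, are assumed to be
removed afterwards, since $S$ is a set.
\begin{verbatim}
/* Illustrative generation of n pseudo-random integer keys. */
#include <stdlib.h>

unsigned int *random_keys(unsigned int n, unsigned int seed)
{
    unsigned int *keys = malloc(n * sizeof *keys);
    unsigned int i;
    srand(seed);
    for (i = 0; i < n; i++)   /* rand() gives at least 15 bits */
        keys[i] = ((unsigned int) rand() << 16)
                  ^ (unsigned int) rand();
    return keys;
}
\end{verbatim}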