\section{Experimental Results}
We now present some experimental results.
The same experiments were run with our algorithm and
the algorithm due to Czech, Havas and Majewski~\cite{chm92}, referred to as
the CHM algorithm.
The two algorithms were implemented in the C language and
are available at \texttt{http://cmph.sf.net}.
Our data consists
of a collection of 100 million
uniform resource locators (URLs) collected from the Web.
The average length of a URL in the collection is 63 bytes.
All experiments were carried out on
a computer running the Linux operating system, version 2.6.7,
with a 2.4 gigahertz processor and
4 gigabytes of main memory.

Table~\ref{tab:characteristics} presents the main characteristics
of the two algorithms.
The number of edges in the graph $G=(V,E)$ is~$|S|=n$,
the number of keys in the input set~$S$.
The number of vertices of $G$ is equal to $1.15n$ and $2.09n$
for our algorithm and the CHM algorithm, respectively.
This measure determines the amount of space needed to store the array $g$.
Consequently, our algorithm reduces the space required to store the
function to $55\%$ of the space required by the CHM algorithm
(since $1.15/2.09 \approx 0.55$).
The number of critical edges
is $\frac{1}{2}|E(G)|$ for our algorithm and 0 for the CHM algorithm.
Our algorithm generates random graphs that contain cycles with high
probability, whereas the CHM algorithm generates acyclic random graphs.
Finally, the CHM algorithm generates order preserving functions,
while our algorithm does not preserve order.
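
To make the space figures concrete, the following is a minimal C sketch
of how the resulting function is evaluated in both algorithms: the array
$g$, with $|V(G)| = cn$ entries, is the only data structure kept at
query time. The functions \texttt{h1} and \texttt{h2} and all
identifiers below are illustrative, not the interface of the code at
\texttt{http://cmph.sf.net}.
{\footnotesize
\begin{verbatim}
/* Assumed: the two hash functions chosen in the
 * mapping step, returning values in [0, cn). */
extern unsigned int h1(const char *key, unsigned int len);
extern unsigned int h2(const char *key, unsigned int len);

/* Hedged sketch of evaluating the minimal perfect hash
 * function.  g has cn entries (c = 1.15 for our
 * algorithm, c = 2.09 for CHM). */
unsigned int mphf(const char *key, unsigned int len,
                  const unsigned int *g,
                  unsigned int n, unsigned int cn)
{
    unsigned int u = h1(key, len) % cn;  /* edge endpoint u */
    unsigned int v = h2(key, len) % cn;  /* edge endpoint v */
    return (g[u] + g[v]) % n;            /* value in [0, n) */
}
\end{verbatim}
}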

\vspace{-10pt}
\begin{table}[htb]
{\footnotesize
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
& $c$ & $|E(G)|$ & $|V(G)|=|g|$ & $|E(G_\crit)|$ & $G$ & Order preserving \\
\hline
Our algorithm & 1.15 & $n$ & $cn$ & $0.5|E(G)|$ & cyclic & no \\
\hline
CHM algorithm & 2.09 & $n$ & $cn$ & 0 & acyclic & yes \\
\hline
\end{tabular}
\end{center}
}
\caption{Main characteristics of the algorithms}
\vspace{-25pt}
\label{tab:characteristics}
\end{table}

Table~\ref{tab:timeresults} presents time measurements.
All times are in seconds.
The table entries are averages over 50 trials.
The column labelled $N_i$ gives
the number of iterations needed to generate the random graph $G$
in the mapping step of the algorithms.
The next columns give the running times
for the mapping and ordering steps together and for the searching
step of each algorithm.
The last column gives the percentage gain of our algorithm
over the CHM algorithm.
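This gain is computed from the total times as
\[
\mathit{gain} = \left(\frac{t_{\mathrm{CHM}}}{t_{\mathrm{ours}}} - 1\right) \times 100;
\]
for instance, for $n=100{,}000{,}000$ we have
$(1{,}288.29/810.60 - 1)\times 100 \approx 59\%$.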

\begin{table*}
{\footnotesize
\begin{center}
\begin{tabular}{|c|cccc|cccc|c|}
\hline
\raisebox{-0.7em}{$n$} & \multicolumn{4}{c|}{\raisebox{-1mm}{Our algorithm}} &
\multicolumn{4}{c|}{\raisebox{-1mm}{CHM algorithm}}& \raisebox{-0.2em}{Gain}\\
\cline{2-5} \cline{6-9}
& \raisebox{-1mm}{$N_i$} &\raisebox{-1mm}{Map+Ord} &
\raisebox{-1mm}{Search} &\raisebox{-1mm}{Total} &
\raisebox{-1mm}{$N_i$} &\raisebox{-1mm}{Map+Ord} &\raisebox{-1mm}{Search} &
\raisebox{-1mm}{Total} & \raisebox{0.2em}{(\%)}\\
\hline
%1,562,500 & 2.28 & 8.54 & 2.37 & 10.91 & 2.70 & 14.56 & 1.57 & 16.13 & 48 \\
%3,125,000 & 2.16 & 15.92 & 4.88 & 20.80 & 2.85 & 30.36 & 3.20 & 33.56 & 61 \\
6,250,000 & 2.20 & 33.09 & 10.48 & 43.57 & 2.90 & 62.26 & 6.76 & 69.02 & 58 \\
12,500,000 & 2.00 & 63.26 & 23.04 & 86.30 & 2.60 & 117.99 & 14.94 & 132.92 & 54 \\
25,000,000 & 2.00 & 130.79 & 51.55 & 182.34 & 2.80 & 262.05 & 33.68 & 295.73 & 62 \\
%50,000,000 & 2.07 & 273.75 & 114.12 & 387.87 & 2.90 & 577.59 & 73.97 & 651.56 & 68 \\
100,000,000 & 2.07 & 567.47 & 243.13 & 810.60 & 2.80 & 1,131.06 & 157.23 & 1,288.29 & 59 \\
\hline
\end{tabular}
\end{center}
\caption{Time measurements
for our algorithm and the CHM algorithm}
\vspace{-25pt}
\label{tab:timeresults}
}\end{table*}

\enlargethispage{\baselineskip}
The mapping step of the new algorithm is faster because
the expected number of iterations needed to generate
$G$ is 2.13 for our algorithm and 2.92 for the CHM algorithm.
In addition, the graph $G$ generated by our algorithm
has $1.15n$ vertices, against $2.09n$ for the CHM algorithm.
These two facts make our algorithm faster in the mapping step.
The time for the ordering step of our algorithm is approximately equal to
the time the CHM algorithm spends checking whether $G$ is acyclic.
The searching step of the CHM algorithm is faster, but in total
our algorithm is, on average, approximately 58\% faster
than the CHM algorithm.

The experimental results fully back the theoretical results.
It is important to notice the times for the searching step:
in both algorithms it does not dominate the total time,
and the measurements clearly show its linear behavior.
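
The linearity of the searching step is easiest to see in the CHM case,
where the $g$ values are assigned by a single traversal of the acyclic
graph that touches each vertex and edge a constant number of times. The
following is a minimal C sketch under that description; the types and
names are illustrative and do not correspond to the interface of the
code at \texttt{http://cmph.sf.net}.
{\footnotesize
\begin{verbatim}
#include <stddef.h>

typedef struct edge {      /* adjacency-list entry */
    unsigned int dest;     /* other endpoint of the edge */
    unsigned int index;    /* value the edge's key must hash to */
    struct edge *next;
} edge_t;

/* Depth-first assignment of g: afterwards,
 * (g[u] + g[v]) mod n equals the index stored on each
 * reachable edge {u,v}.  This works because G is
 * acyclic, so every vertex except the root is reached
 * through exactly one already-fixed neighbour. */
static void assign(edge_t **adj, unsigned int u,
                   unsigned int *g, char *visited,
                   unsigned int n)
{
    visited[u] = 1;
    for (edge_t *e = adj[u]; e != NULL; e = e->next)
        if (!visited[e->dest]) {
            g[e->dest] = (e->index + n - g[u]) % n;
            assign(adj, e->dest, g, visited, n);
        }
}
\end{verbatim}
}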

We now present a heuristic that reduces the space requirement
to any given value between $1.15n$ words and $0.93n$ words.
The heuristic reuses, when possible, the set
of $x$ values that caused reassignments, just before trying $x+1$
(see Section~\ref{sec:searching}).
The lower limit $c=0.93$ was obtained experimentally.
We generated $10{,}000$ random graphs for
each size $n$ ($n=10^5$, $5 \times 10^5$, $10^6$, $2\times 10^6$).
With $c=0.93$ we were always able to generate~$h$, but with $c=0.92$ we never
succeeded.
Decreasing the value of $c$ leads to an increase in the number of
iterations needed to generate $G$.
For example, for $c=1$ and $c=0.93$, the analytical expected numbers
of iterations are $2.72$ and $3.17$, respectively
(for $n=12{,}500{,}000$, the measured numbers of iterations are 2.78 for $c=1$ and 3.04
for $c=0.93$).
Table~\ref{tab:timeresults2} presents the total times to construct a
function for $n=12{,}500{,}000$: the total time increases from $86.30$ seconds
for $c=1.15$ (see Table~\ref{tab:timeresults}) to
$101.74$ seconds for $c=1$ and to $102.19$ seconds for $c=0.93$.
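
As a rough illustration of the heuristic (our reading of the description
above, not the actual implementation), the candidate value for the next
vertex is drawn first from a pool of values that previously caused
reassignments, and only then from the fresh value $x+1$:
{\footnotesize
\begin{verbatim}
/* Hedged sketch of the reuse heuristic.  pool holds
 * values that caused reassignments earlier; they are
 * retried before a fresh value x+1 is consumed. */
unsigned int next_value(unsigned int *pool,
                        unsigned int *pool_size,
                        unsigned int *x)
{
    if (*pool_size > 0)
        return pool[--(*pool_size)];  /* reuse first */
    return (*x)++;                    /* then x, x+1, ... */
}
\end{verbatim}
}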

\vspace{-5pt}
\begin{table*}
{\footnotesize
\begin{center}
\begin{tabular}{|c|cccc|cccc|}
\hline
\raisebox{-0.7em}{$n$} & \multicolumn{4}{c|}{\raisebox{-1mm}{Our algorithm $c=1.00$}} &
\multicolumn{4}{c|}{\raisebox{-1mm}{Our algorithm $c=0.93$}} \\
\cline{2-5} \cline{6-9}
& \raisebox{-1mm}{$N_i$} &\raisebox{-1mm}{Map+Ord} &
\raisebox{-1mm}{Search} &\raisebox{-1mm}{Total} &
\raisebox{-1mm}{$N_i$} &\raisebox{-1mm}{Map+Ord} &\raisebox{-1mm}{Search} &
\raisebox{-1mm}{Total} \\
\hline
12,500,000 & 2.78 & 76.68 & 25.06 & 101.74 & 3.04 & 76.39 & 25.80 & 102.19 \\
\hline
\end{tabular}
\end{center}
\caption{Time measurements
for our tuned algorithm with $c=1.00$ and $c=0.93$}
\vspace{-25pt}
\label{tab:timeresults2}
}
\end{table*}

We also compared our algorithm with the ones proposed by Pagh~\cite{p99} and by
Dietzfelbinger and Hagerup~\cite{dh01}. The authors sent us their
source code. In their implementations the set of keys is a set of random integers.
We modified our implementation to generate~$h$ from a set of random
integers in order to make a fair comparison. For a set of $10^6$ random integers,
the times to generate a minimal perfect hash function were $2.7$, $4.0$ and $4.5$ seconds for
our algorithm, Pagh's algorithm and Dietzfelbinger and Hagerup's algorithm, respectively.
Thus, our algorithm was 48\% faster than Pagh's algorithm and 67\% faster than
Dietzfelbinger and Hagerup's algorithm, on average. This gain was maintained for sets of other
sizes.
Our algorithm needs $kn$ ($k \in [0.93, 1.15]$) words to store
the resulting function, while Pagh's algorithm needs $kn$ ($k > 2$) words and
Dietzfelbinger and Hagerup's algorithm needs $kn$ ($k \in [1.13, 1.15]$) words.
In all cases the time to generate the function is inversely proportional to the value of $k$.
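To put these bounds in perspective, assuming 32-bit words (an assumption;
the word size is implementation dependent), for our full collection of
$n=10^8$ keys the resulting function occupies
\[
0.93 \times 10^8 \times 4\ \mathrm{bytes} \approx 372\ \mathrm{MB}
\quad\text{to}\quad
1.15 \times 10^8 \times 4\ \mathrm{bytes} \approx 460\ \mathrm{MB},
\]
while $k > 2$ implies more than $800$ megabytes for Pagh's algorithm.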
\enlargethispage{\baselineskip}