1 files changed, 46 insertions, 14 deletions
diff --git a/eval.tex b/eval.tex
index 1431cd5..cfd76ab 100644
--- a/eval.tex
+++ b/eval.tex
@@ -1,29 +1,61 @@
 \section{Experiments}
-TODO: prose
+To evaluate our approach, we perform experiments with our implementation
+\psychic{} on C meta programs to generate decision trees of patches.
+
+\subsection{Experiment setup}
+Experiments are run on a machine with AMD Ryzen 3700X at \SI{3.6}{\giga\hertz}
+and \SI{16}{\giga\byte} of memory.  Software dependencies and tooling
+are provided by a declarative Nix environment~\cite{nix} for reproducibility.
 
-\subsection{Experimental setup}
 We evaluate \psychic{} on patches generated by MSV~\cite{msv}, a fork
-of the pattern-based \gls{apr} tool \textsc{Prophet}~\cite{genprog}
+of the pattern-based \gls{apr} tool \textsc{Prophet}~\cite{prophet}
 that adds extra repair templates and generate meta programs.
-MSV is used on \textsc{IntroClass}, a set of attempts for solving
+Patches are generated for \textsc{IntroClass}, a set of attempts for solving
 homework problems~\cite{introclass}.
 
+In addition to the test cases provided by \textsc{IntroClass},
+which are non-exhaustive of the input domains, we use
+property tests~\cite{hypothesis} with the respective
+ground truth program\footnote{\url{https://github.com/McSinyx/IntroClass}}
+to select buggy programs.  MSV then generate fixes in form of meta programs,
+which are then post-processed into \psychic{}-compatible format.
+
+A selection of 10 meta programs with at least two semantically different
+revisions (including the original buggy version) are then fed to \psychic{}
+to generate differential tests and decision trees with a timeout of 10 minutes.
+
 \subsection{Results}
+Experiment results on \textsc{IntroClass} is shown in table~\ref{result}.
+In case of \texttt{digits-1b31@2} and \texttt{smallest-} \texttt{d25c@1},
+\psychic{} failed to generate differential tests.  In other cases,
+except for the timeout in \texttt{digits-} \texttt{0cdf@3},
+the tool successfully differentiate all semantically different patches.
+
 \begin{table}[h]
   \begin{center}
-    \begin{tabular}{l c c c c c r}
+    \begin{tabular}{l c l r r r r}
       \toprule
-      Task & Attempt & Bug & \texttt{\#} revisions & Tree height & Largest cluster & Time\\
+      Task & Attempt & Bug type & $n$ & Tree height & Largest cluster & Time\\
       \midrule
-      \texttt{checksum} & \texttt{3b@3} & & 2 & 1 & 1 & 1.2\\
-      \texttt{checksum} & \texttt{cb@3} & & 3 & 1 & 2 & 1.7\\
-      \texttt{grade} & \texttt{1b@3} & UB & 2 & 1 & 1 & 0.8\\
-      \texttt{grade} & \texttt{31@3} & Logic & 7 & 2 & 5 & 1.0\\
-      \texttt{grade} & \texttt{b1@3} & UB & 2 & 1 & 1 & 0.5\\
-      \texttt{smallest} & \texttt{07@2} & UB & 9 & 1 & 3 & 0.7\\
-      \texttt{smallest} & \texttt{d2@1} & UB & 5 & 0 & 5 & 1.5\\
+      \texttt{checksum} & \texttt{3b23@3} & Arithmetic & 2 & 1 & 1 & 1.2\\
+      \texttt{checksum} & \texttt{cb24@3} & Logic & 3 & 1 & 2 & 1.7\\
+      \texttt{digits} & \texttt{0cdf@3} & Logic & 16 & 1 & 5 & 600.0\\
+      \texttt{digits} & \texttt{1b31@2} & Logic & 5 & 0 & 5 & 0.9\\
+      \texttt{grade} & \texttt{1b31@3} & UB & 2 & 1 & 1 & 0.8\\
+      \texttt{grade} & \texttt{317a@3} & Logic & 8 & 2 & 5 & 1.0\\
+      \texttt{grade} & \texttt{b192@3} & UB & 2 & 1 & 1 & 0.5\\
+      \texttt{median} & \texttt{97f6@3} & UB & 7 & 1 & 6 & 1.6\\
+      \texttt{smallest} & \texttt{0704@2} & UB & 5 & 1 & 3 & 87.0\\
+      \texttt{smallest} & \texttt{d25c@1} & UB & 5 & 0 & 5 & 1.3\\
       \bottomrule
     \end{tabular}
   \end{center}
-  \caption{Decision tree quality on patch pool generated by MSV for IntroClass programs}
+  \caption{Decision tree quality on patch pool generated by MSV
+    for \textsc{IntroClass} programs.  Respective to each meta program
+    with $n$ revisions are the height of the patch decision tree
+    and the maximum number of undistinguished revisions.}\label{result}
 \end{table}
+
+The stark difference in execution time reveals an inherent weakness
+of our approach: heavily-nested conditions or loops increase
+the search space dramatically.