technique.tex


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354

\section{Technique}
In order to generate test cases to distinguish between multiple patches,
our tool extends symbolic execution in a number of ways.  First, instead
of taking in a single program, all of its revisions are passed to the tool.
Since each patch only modifies a small portion of the program, most of the code
is shared among these revisions, hence they are combined into
a \emph{meta program} to avoid repeating execution.

Second, we combine the path constraints in execution \glspl{state}
of different revisions in a certain way before feeding to a \gls{smt} solver
to generate \glspl{difftest}.  Third, the new conditions the combined
\gls{smt} formula includes (aside from the conjunction of its parents)
are assertions on the distinction of corresponding values across revisions,
so we offer multiple options to capture such \emph{\glspl{symout}}.

Next, we introduces concrete execution to reduce the theorem proving load,
and produce \glspl{difftest} for clustering the patches and finally produce
a decision tree.  One may observe that the clustering information
can help with selecting more ``promising'' states, although such scheduling
enhancement is not covered by this work.

We implemented these extensions on top of \klee{} in a tool
named \psychic{}, whose high-level architectures are compared
in figure~\ref{arch}.  The overall process of \psychic{}
is denoted in algorithm~\ref{overall}.

\begin{figure}[h]
  \begin{subcaptiongroup}
    \subcaptionlistentry{KLEE}\label{klee-arch}
    \begin{tikzpicture}[mymatrix/.style={matrix of nodes,
                                         nodes=typetag,row sep=1ex},
                        typetag/.style={draw,anchor=west},
                        title/.style={draw=none,inner sep=0pt},
                        line width=0.2ex]
      \linespread{0.8}
      \node[draw,label={[name=sched]Scheduler}] (states) {States};
      \node[draw,fit={(sched) (states)}] {};

      \node[draw,right=11em of states] (symb) {Interpreter};
      \draw[-angle 45] (states.north east) to[in=90,out=15]
        node[sloped,above,pos=0.4]{Selected state} (symb.north west);
      \draw[-angle 45] (symb.south west) to[in=0,out=-90]
        node[sloped,above,pos=0.6]{Child state} (states.east);

      \node[draw,below=3em of symb] (smt) {Theorem prover};
      \draw[angle 45-angle 45] (symb) to (smt);

      \node[left=4em of states.west] (bc) {Bitcode};
      \draw[-angle 90] (bc) to[in=180,out=60] (states.west);
      \node[above=1em of bc,align=center] (seed) {Seed\\input};
      \draw[dotted] (seed) to[out=0,in=180] (states);
      \node[below=1em of bc,align=center] (tests) {Test\\cases};
      \draw[-angle 45] (smt.west) to[in=0,out=180] (tests);
      \node[below=5em of sched.south east] (symb) {\captiontext{}};
    \end{tikzpicture}
    \vspace{-2em}
    \subcaptionlistentry{Psychic}\label{psychic-arch}
    \begin{tikzpicture}[mymatrix/.style={matrix of nodes,
                                         nodes=typetag,row sep=1ex},
                        typetag/.style={draw,anchor=west},
                        title/.style={draw=none,inner sep=0pt},
                        line width=0.2ex]
      \linespread{0.8}
      \node[draw,label={[name=sched]Scheduler}] (states) {States};
      \node[draw,below=0.5ex of states] (clstr) {Clusters};
      \node[draw,fit={(sched) (states) (clstr)}] {};

      \node[draw,right=11em of states] (symb) {Interpreter};
      \draw[-angle 45] (states.north east) to[in=90,out=15]
        node[sloped,above,pos=0.4]{Selected state} (symb.north west);
      \draw[-angle 45] (symb.south west) to[in=0,out=-90]
        node[sloped,above,pos=0.6]{Child state} (states.east);

      \node[draw,below=6em of symb] (smt) {Theorem prover};
      \draw[angle 45-angle 45] (symb) to (smt);

      \node[draw,below=7em of sched,label={[name=diff]Differentiator}]
        (outs) {Fruitful states};
      \draw[-angle 45] (symb.south west) to[in=45,out=-90]
        node[sloped,above,pos=0.6]{State with output} (outs.15);
      \draw[-angle 90] (outs) to[in=90,out=0,pos=0.4]
        node[sloped,below]{Path conditions} (smt.north west);
      \draw (outs.south east) to[in=90,out=10] (smt.north west);

      \node[draw,below=0.5ex of outs] (prog) {Binary};
      \node[draw,fit={(diff) (outs) (prog)}] {};
      \draw[-angle 45] (smt.south west) to[out=-90,in=0]
        node[sloped,below]{Concrete input} (prog);
      \draw[-angle 45] (prog) to[out=180,in=-150]
        node[left,align=center]{Diff.\\test} (clstr.south west);

      \node[left=4em of states.west] (bc) {Bitcode};
      \draw[-angle 90] (bc) to[in=180,out=60] (states.west);
      \node[below=1em of bc,align=center] (tree) {Decision\\tree};
      \draw[-angle 45] (clstr.west) to[out=180,in=60] (tree);
      \node[above=1em of bc,align=center] (seed) {Seed\\input};
      \draw[dotted] (seed) to[out=0,in=180] (states);
      \draw[-angle 45] (bc.south west)
        to[out=-135,in=-135] (prog.south west);
      \node[below=12em of sched.south east] (symb) {\captiontext{}};
    \end{tikzpicture}
  \end{subcaptiongroup}
  \captionsetup{subrefformat=parens}
  \caption{Architecture overviews of \subref{klee-arch} \klee{}
           and \subref{psychic-arch} \psychic{} extension.}\label{arch}
\end{figure}

\begin{algorithm}
  \caption{Overall execution of \psychic{}}\label{overall}
  \begin{algorithmic}[1]
    \Require Meta program $P$
    \Ensure Set of differential tests $T$
    \State $T \gets \varnothing$
    \State $X \gets \varnothing$ \Comment{Halt states}
    \State $S \gets \textsc{InitialState}(P)$
    \While{$S \neq \varnothing$}
      \State $i, e, \varPhi, n \gets \textsc{Select}(S)$
        \Comment{instruction, environment,
                 path constraints, and revision number}
      \State $S \gets S \setminus \{(i, e, \varPhi, n)\}$
      \State $i', e' \gets \textsc{Execute}(i, e)$
      \If{$\textsc{IsHalt}(i)$}
        \For{$(\varPhi', n') \in X$}
          \If{$n \ne n'$}
            \State $t \gets \textsc{DiffTest}(\varPhi, n, \varPhi', n')$
            \If{$t \ne \bot$}
              $T \gets T \cup \{t\}$
            \EndIf
          \EndIf
        \EndFor
        \State TODO: describe clustering
      \ElsIf{$\textsc{IsBranch}(i)$}
        \State $\varphi, i_c', i_a' \gets i'$
          \Comment{condition, consequent, and alternative}
        \If{$\textsc{IsMeta}(\varphi)$}
          \State $n' \gets \textsc{RevisionNumber}(\varphi)$
          \If{$n = 0 \lor n' = 0$}
            $S \gets S \cup \{(i_c', e', \varPhi, n')\}$
            \Comment{at most one simultaneous patch}
          \EndIf
          \State $S \gets S \cup \{(i_a', e', \varPhi, n)\}$
        \Else
          \If{$\textsc{sat}(\varPhi \land \varphi)$}
            $S \gets S \cup \{(i_c', e', \varPhi \land \varphi, n)\}$
          \EndIf
          \If{$\textsc{sat}(\varPhi \land \lnot\varphi)$}
            $S \gets S \cup \{(i_a', e', \varPhi \land \lnot\varphi, n)\}$
          \EndIf
        \EndIf
      \Else
        \If{$\textsc{IsReturn}(i)$}
          \State $s' \gets i', e', \varPhi \land \textsc{FunctionOutputs}(i)$
        \Else
          \State $s' \gets i', e', \varPhi$
        \EndIf
        \State $S \gets S \cup \{s'\}$
      \EndIf
    \EndWhile
  \end{algorithmic}
\end{algorithm}

\subsection{Meta program}
The meta program format takes inspiration from program unification
used for shadow symbolic execution.  Unlike \textsc{Shadow}
which tests only one patch~\cite{shadow} to reveal its regressions, though,
the goal of \psychic{} is to find the difference in behavior
between multiple patches.  Therefore, the former's annotation
at level of expressions does not meet our expectation on expressivity.
Instead, we implement higher-order branching.  Annotating patches
at block level simplifies the generation process
and improves the readability of the meta program.

\begin{figure}\centering
  \begin{subfigure}{0.35\textwidth}
\begin{alltt}
\textcolor{purple}{---} dfasearch.c	\textcolor{purple}{c1cb19fe}
\textcolor{blue}{+++} dfasearch.c	\textcolor{blue}{8f08d8e2}
@@ \textcolor{purple}{-360,2} \textcolor{blue}{+360,1} @@
\textcolor{purple}{-beg = match;}
\textcolor{purple}{-goto success_in_len;}
\textcolor{blue}{+goto success;}\end{alltt}
    \subcaption{}\label{8f08d8e2}
  \end{subfigure}
  \begin{subfigure}{0.35\textwidth}
\begin{alltt}
\textcolor{purple}{---} dfasearch.c	\textcolor{purple}{c1cb19fe}
\textcolor{violet}{+++} dfasearch.c	\textcolor{violet}{YmI4MGM3ND}
@@ \textcolor{purple}{-360,2} \textcolor{violet}{+360,2} @@
 beg = match;
\textcolor{purple}{-goto success_in_len;}
\textcolor{violet}{+goto success;}\end{alltt}
    \subcaption{}\label{YmI4MGM3ND}
  \end{subfigure}
  \captionsetup{subrefformat=parens}
  \caption{Unified format of \subref{8f08d8e2} a fix
           in \texttt{grep}'s commit \texttt{8f08d8e2},
           and \subref{YmI4MGM3ND} an alternative patch.}\label{grep-8f08d8e2}
\end{figure}

For instance, given the two unified diffs~\cite{unidiff} of a bug fix
from upstream \texttt{grep}, and a patch in \textsc{DBGBench}~\cite{dbgbench}
in figure~\ref{grep-8f08d8e2},\footnote{The snippets have been dedented
to fit within the width of this page.} one can construct the relevant part
of the meta program in figure~\ref{grep-8f08d8e2-meta}, where \verb|__klee_meta|
parses the environment for the choice of revision when executed
concretely~\cite{msv} and returns a new symbolic value in symbolic evaluation.
This way, the meta program can be used for both executions to share metadata
such as the patch number (82 and 43 in this case).

\begin{figure}\centering
\begin{varwidth}{\linewidth}\begin{alltt}
int __klee_meta2 = __klee_meta(2);
if (__klee_meta(2) == \textcolor{purple}{0}) \verb|{| // original buggy version
  \textcolor{purple}{beg = match;}
\verb|}| else if (__klee_meta2 == \textcolor{blue}{82}) \verb|{| // upstream patch
\verb|}| else if (__klee_meta2 == \textcolor{violet}{43}) \verb|{| // alternative patch
  \textcolor{violet}{beg = match;}
\verb|}|
int __klee_meta3 = __klee_meta(3);
if (__klee_meta(3) == \textcolor{purple}{0}) \verb|{|
  \textcolor{purple}{goto success_in_len;}
\verb|}| else if (__klee_meta3 == \textcolor{blue}{82}) \verb|{|
  \textcolor{blue}{goto success;}
\verb|}| else if (__klee_meta3 == \textcolor{violet}{43}) \verb|{|
  \textcolor{violet}{goto success;}
\verb|}|\end{alltt}\end{varwidth}
\caption{Meta branches with the fixes in \texttt{grep}'s upstream
         and alternative patches in figure~\ref{grep-8f08d8e2}.}
\label{grep-8f08d8e2-meta}
\end{figure}

One may notice that the meta program may have any level of granularity.
\psychic{} utilizes the meta program to maximize instructions
and path constraints sharing, which effectively means to defer
the divergence point deep into the control flow.  For its intended use
on automatically generated patches whose hunks are commonly short,
optimizing the meta program construction is not a major issue.

In algorithm~\ref{overall}, \textsc{IsMeta} cheaply detects
each higher-order branch via basic pattern matching and skip
adding its condition to the children states.  Instead,
those states carry the revision number outside of their path constraints
for constant-time access when pruning paths going though multiple patches.

\subsection{Symbolic output selection}
\psychic{} is based on \klee{} and thus inherits its support
for the diverse set of programs written in languages with C \gls{ffi}
for calling \klee{} intrinsics and a compiler using LLVM as a backend.
One kind of LLVM instruction may be used for difference purposes,
so for better applicability we try to accommodate for a range of usages.
For capturing output values, \psychic{} offers the following methods.
\pagebreak
\begin{enumerate}[noitemsep]
  \item Source code annotation\label{annote}
  \item Output files (including standard output)\label{stdout}
  \item Function return values\label{ret}
  \item Pointers to mutable data as function arguments\label{parg}
\end{enumerate}

Method \ref{annote} is handled similar to previous works
described in section~\ref{klee-use}.  With \klee{} support for symbolic files,
the same avatar idiom is trivially applied for~\ref{stdout}.  However,
since files are byte buffers, their application in symbolic execution
is limited due to the scalability of \gls{smt} solvers, which treat
each byte as a variable.  For human-readable files, this is particularly
inefficient due to their low information entropy~\cite{entropy}.

To balance between manual annotation and computational scalability,
methods \ref{ret} and \ref{parg} are introduced to automatically capture output
more precisely at the end of every interested function.  Debugging information
forwarded from compiler frontend are kept in bitcodes, and they are used
to limit the subroutines to be monitored, for example in algorithm~\ref{fnout},
the file that is patched by an \gls{apr} tool.

\begin{algorithm}
  \caption{Extraction of symbolic outputs of a function}\label{fnout}
  \begin{algorithmic}[1]
    \Procedure{FunctionOutputs}{$i$}
      \State $f \gets \textsc{Function}(i)$
      \If{$\textsc{SourceFile}(i) \neq \textsc{PatchedFile}$}
        \Return $\top$
      \EndIf
      \State $\varPhi \gets \textsc{SymbolicOutput}(\textsc{ReturnValue}(i), f)$
      \For{$a \in \textsc{FunctionArguments}(i)$}
        \If{$\textsc{IsPointer}(a) \land \lnot\textsc{IsConstPointer}(a)
                                   \land \lnot\textsc{IsFunctionPointer}(a)$}
          \State $\varPhi \gets \varPhi
            \land \textsc{SymbolicOutput}(\textsc{Dereference}(a), a)$
        \EndIf
      \EndFor
      \State \Return $\varPhi$
    \EndProcedure
  \end{algorithmic}
\end{algorithm}

Each symbolic output then is given an unique name for later comparison.
For the sake of simplicity, we name them after the capturing method
and associated identifier, i.e.~variable or subroutine.

\subsection{Input generation}
When two states monitor some commonly named symbolic outputs
and are at \emph{parallel} positions in their path, their path constraints
can be used to generate a \gls{difftest}.  Two states are said to be parallel
if and only if their program counter points to the same instruction
and the conjunction of their path conditions is satisfiable.

Currently \psychic{} only compare halt states to minimize memory usage
and \gls{smt} solving cost.  Such \gls{smt} formula is constructed
in algorithm~\ref{diffgen} by first append a suffix to each output name,
then for the common ones assert for possible distinction between two paths.
If the path constraints of both states are \gls{sat} with at least
one difference in output, a \gls{difftest} can be generated
from the \gls{smt} model.

\begin{algorithm}
  \caption{\Gls{difftest} generation}\label{diffgen}
  \begin{algorithmic}[1]
    \Procedure{DiffTest}{$\varPhi, n, \varPhi', n'$}
      \State $\varPsi, N \gets \textsc{RenameOutputs}(\varPhi, n)$
      \State $\varPsi', N' \gets \textsc{RenameOutputs}(\varPhi', n')$
      \For{$a \in N \cap N'$}
        \State $\varPsi_\text{diff}
          \gets \varPsi_\text{diff} \lor \textsc{Distinct}(a, n, n')$
      \EndFor
      \If{$\lnot\textsc{sat}\left(\varPsi \land \varPsi'
                                  \land \varPsi_\text{diff}\right)$}
        \State \Return $\bot$
      \EndIf
      \State TODO: describe concrete execution
      \State \Return $\textsc{Model}\left(\varPsi \land \varPsi'
                                          \land \varPsi_\text{diff}\right)$
    \EndProcedure
    \Procedure{RenameOutputs}{$e, s$}
      \State $E, k \gets e$ \Comment{subexpressions and kind}
      \If{$\textsc{IsRead}(t)$}
        \State $a, i \gets E$ \Comment{array name and index}
        \If{$\textsc{IsOutput}(a)$}
          \State $a' \gets \textsc{AppendName}(a, s)$
          \State $e' \gets ((a', i), k)$
          \State \Return $e', \{a\}$
        \Else
          \State \Return $e, \varnothing$
        \EndIf
      \EndIf
      \State $E' \gets E$
      \State $N \gets \varnothing$ \Comment{seen output names}
      \For{$i \in 1{.}\,{.}\textsc{Length}(E)$}
        \State $E'_i, N_+ \gets \textsc{RenameOutputs}(E_i, s)$
        \State $N \gets N \cup N_+$
      \EndFor
      \State \Return $(E', k), N$
    \EndProcedure
  \end{algorithmic}
\end{algorithm}