technique.tex


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387

\section{Technique}
In order to generate test cases to distinguish between multiple patches,
our tool extends symbolic execution in a number of ways.  First, instead
of taking in a single program, all of its revisions are passed to the tool.
Since each patch only modifies a small portion of the program, most of the code
is shared among these revisions, hence they are combined into
a \emph{meta program} to avoid repeating execution.

Second, we combine the path constraints in execution \glspl{state}
of different revisions in a certain way before feeding to a \gls{smt} solver
to generate \glspl{difftest}.  Third, the new conditions the combined
\gls{smt} formula includes (aside from the conjunction of its parents)
are assertions on the distinction of corresponding values across revisions,
so we offer multiple options to capture such \emph{\glspl{symout}}.

Next, we introduce concrete execution to reduce the theorem proving load,
and produce \glspl{difftest} for clustering the patches and finally produce
a decision tree.  One may observe that the clustering information
can help with selecting more ``promising'' states, although such scheduling
enhancement is not covered by this work.

We implemented these extensions on top of \klee{} in a tool
named \psychic{}, whose high-level architectures are compared
in figure~\ref{arch}.  The overall process of \psychic{}
is denoted in algorithm~\ref{overall}.

\begin{figure}[h]
  \begin{subcaptiongroup}
    \subcaptionlistentry{KLEE}\label{klee-arch}
    \begin{tikzpicture}[mymatrix/.style={matrix of nodes,
                                         nodes=typetag,row sep=1ex},
                        typetag/.style={draw,anchor=west},
                        title/.style={draw=none,inner sep=0pt},
                        line width=0.2ex]
      \linespread{0.8}
      \node[draw,label={[name=sched]Scheduler}] (states) {States};
      \node[draw,fit={(sched) (states)}] {};

      \node[draw,right=11em of states] (symb) {Interpreter};
      \draw[-angle 45] (states.north east) to[in=90,out=15]
        node[sloped,above,pos=0.4]{Selected state} (symb.north west);
      \draw[-angle 45] (symb.south west) to[in=0,out=-90]
        node[sloped,above,pos=0.6]{Child state} (states.east);

      \node[draw,below=3em of symb] (smt) {Theorem prover};
      \draw[angle 45-angle 45] (symb) to (smt);

      \node[left=4em of states.west] (bc) {Bitcode};
      \draw[-angle 90] (bc) to[in=180,out=60] (states.west);
      \node[above=1em of bc,align=center] (seed) {Seed\\input};
      \draw[dotted] (seed) to[out=0,in=180] (states);
      \node[below=1em of bc,align=center] (tests) {Test\\cases};
      \draw[-angle 45] (smt.west) to[in=0,out=180] (tests);
      \node[below=5em of sched.south east] (symb) {\captiontext{}};
    \end{tikzpicture}
    \vspace{-2em}
    \subcaptionlistentry{Psychic}\label{psychic-arch}
    \begin{tikzpicture}[mymatrix/.style={matrix of nodes,
                                         nodes=typetag,row sep=1ex},
                        typetag/.style={draw,anchor=west},
                        title/.style={draw=none,inner sep=0pt},
                        line width=0.2ex]
      \linespread{0.8}
      \node[draw,label={[name=sched]Scheduler}] (states) {States};
      \node[draw,below=0.5ex of states] (clstr) {Clusters};
      \node[draw,fit={(sched) (states) (clstr)}] {};

      \node[draw,right=11em of states] (symb) {Interpreter};
      \draw[-angle 45] (states.north east) to[in=90,out=15]
        node[sloped,above,pos=0.4]{Selected state} (symb.north west);
      \draw[-angle 45] (symb.south west) to[in=0,out=-90]
        node[sloped,above,pos=0.6]{Child state} (states.east);

      \node[draw,below=6em of symb] (smt) {Theorem prover};
      \draw[angle 45-angle 45] (symb) to (smt);

      \node[draw,below=7em of sched,label={[name=diff]Differentiator}]
        (outs) {Fruitful states};
      \draw[-angle 45] (symb.south west) to[in=45,out=-90]
        node[sloped,above,pos=0.6]{State with output} (outs.15);
      \draw[-angle 90] (outs) to[in=90,out=0,pos=0.4]
        node[sloped,below]{Path conditions} (smt.north west);
      \draw (outs.south east) to[in=90,out=10] (smt.north west);

      \node[draw,below=0.5ex of outs] (prog) {Binary};
      \node[draw,fit={(diff) (outs) (prog)}] {};
      \draw[-angle 45] (smt.south west) to[out=-90,in=0]
        node[sloped,below]{Concrete input} (prog);
      \draw[-angle 45] (prog) to[out=180,in=-150]
        node[left,align=center]{Diff.\\test} (clstr.south west);

      \node[left=4em of states.west] (bc) {Bitcode};
      \draw[-angle 90] (bc) to[in=180,out=60] (states.west);
      \node[below=1em of bc,align=center] (tree) {Decision\\tree};
      \draw[-angle 45] (clstr.west) to[out=180,in=60] (tree);
      \node[above=1em of bc,align=center] (seed) {Seed\\input};
      \draw[dotted] (seed) to[out=0,in=180] (states);
      \draw[-angle 45] (bc.south west)
        to[out=-135,in=-135] (prog.south west);
      \node[below=12em of sched.south east] (symb) {\captiontext{}};
    \end{tikzpicture}
  \end{subcaptiongroup}
  \captionsetup{subrefformat=parens}
  \caption{Architecture overviews of \subref{klee-arch} \klee{}
           and \subref{psychic-arch} \psychic{} extension.}\label{arch}
\end{figure}

\begin{algorithm}
  \caption{Overall execution of \psychic{}}\label{overall}
  \begin{algorithmic}[1]
    \Require Meta program $P$
    \Ensure Set of differential tests $T$
    \State $T \gets \varnothing$
    \State $D \gets \varnothing$ \Comment{Distinguished revision pairs}
    \State $H \gets \varnothing$ \Comment{Halt states}
    \State $S \gets \Call{InitialState}{P}$
    \While{$S \neq \varnothing$}
      \State $(i, e, \varPhi, n) \gets \Call{Select}{S}$
        \Comment{instruction, environment,
                 path constraints, and revision number}
      \State $S \gets S \setminus \{(i, e, \varPhi, n)\}$
      \State $(i', e') \gets \Call{Execute}{i, e}$
      \If{$\Call{IsHalt}{i}$}
        \For{$(\varPhi', n') \in H$}
          \State $t \gets \Call{DiffTest}{D, \varPhi, n, \varPhi', n'}$
          \If{$t \ne \bot$}
            \State $T \gets T \cup \{t\}$
            \State $D \gets D \cup \Call{DistinguishPairs}{t}$
          \EndIf
        \EndFor
        \State $H \gets H \cup \{(\varPhi, n)\}$
      \ElsIf{$\Call{IsBranch}{i}$}
        \State $(\varphi, i_c', i_a') \gets i'$
          \Comment{condition, consequent, and alternative}
        \If{$\Call{IsMeta}{\varphi}$}
          \State $S \gets S
            \cup \Call{ApplyMetaBranch}{\varphi, i_c', i_a', e', \varPhi, n}$
        \Else
          \If{$\Call{sat}{\varPhi \land \varphi}$}
            $S \gets S \cup \{(i_c', e', \varPhi \land \varphi, n)\}$
          \EndIf
          \If{$\Call{sat}{\varPhi \land \lnot\varphi}$}
            $S \gets S \cup \{(i_a', e', \varPhi \land \lnot\varphi, n)\}$
          \EndIf
        \EndIf
      \Else
        \If{$\Call{IsReturn}{i}$}
          \State $\varPhi' \gets \varPhi \land \Call{FunctionOutputs}{i}$
        \Else
          \State $\varPhi' \gets \varPhi$
        \EndIf
        \State $S \gets S \cup \{(i', e', \varPhi')\}$
      \EndIf
    \EndWhile
  \end{algorithmic}
\end{algorithm}

\subsection{Meta program}
The meta program format takes inspiration from program unification
used for shadow symbolic execution.  Unlike \textsc{Shadow}
which tests only one patch~\cite{shadow} to reveal its regressions, though,
the goal of \psychic{} is to find the difference in behavior
between multiple patches.  Therefore, the former's annotation
at the level of expressions does not meet our expectation on expressivity.
Instead, we implement higher-order branching.  Annotating patches
at block level simplifies the generation process
and improves the readability of the meta program.

\begin{figure}\centering
  \begin{subfigure}{0.35\textwidth}
\begin{alltt}
\textcolor{purple}{---} dfasearch.c	\textcolor{purple}{c1cb19fe}
\textcolor{blue}{+++} dfasearch.c	\textcolor{blue}{8f08d8e2}
@@ \textcolor{purple}{-360,2} \textcolor{blue}{+360,1} @@
\textcolor{purple}{-beg = match;}
\textcolor{purple}{-goto success_in_len;}
\textcolor{blue}{+goto success;}\end{alltt}
    \subcaption{}\label{8f08d8e2}
  \end{subfigure}
  \begin{subfigure}{0.35\textwidth}
\begin{alltt}
\textcolor{purple}{---} dfasearch.c	\textcolor{purple}{c1cb19fe}
\textcolor{violet}{+++} dfasearch.c	\textcolor{violet}{YmI4MGM3ND}
@@ \textcolor{purple}{-360,2} \textcolor{violet}{+360,2} @@
 beg = match;
\textcolor{purple}{-goto success_in_len;}
\textcolor{violet}{+goto success;}\end{alltt}
    \subcaption{}\label{YmI4MGM3ND}
  \end{subfigure}
  \captionsetup{subrefformat=parens}
  \caption{Unified format of \subref{8f08d8e2} a fix
           in \texttt{grep}'s commit \texttt{8f08d8e2},
           and \subref{YmI4MGM3ND} an alternative patch.}\label{grep-8f08d8e2}
\end{figure}

For instance, given the two unified diffs~\cite{unidiff} of a bug fix
from upstream \texttt{grep}, and a patch in \textsc{DBGBench}~\cite{dbgbench}
in figure~\ref{grep-8f08d8e2},\footnote{These snippets have been outdented
to fit within the width of this page.} one can construct the relevant part
of the meta program in figure~\ref{grep-8f08d8e2-meta}, where \verb|__klee_meta|
parses the environment for the choice of revision when executed
concretely~\cite{msv} and returns a new symbolic value in symbolic evaluation.
This way, the meta program can be used for both executions to share metadata
such as the patch identifier (82 and 43 in this case).

\begin{figure}\centering
\begin{varwidth}{\linewidth}\begin{alltt}
int __klee_meta2 = __klee_meta(2);
if (__klee_meta(2) == \textcolor{purple}{0}) \verb|{| // original buggy version
  \textcolor{purple}{beg = match;}
\verb|}| else if (__klee_meta2 == \textcolor{blue}{82}) \verb|{| // upstream patch
\verb|}| else if (__klee_meta2 == \textcolor{violet}{43}) \verb|{| // alternative patch
  \textcolor{violet}{beg = match;}
\verb|}|
int __klee_meta3 = __klee_meta(3);
if (__klee_meta(3) == \textcolor{purple}{0}) \verb|{|
  \textcolor{purple}{goto success_in_len;}
\verb|}| else if (__klee_meta3 == \textcolor{blue}{82}) \verb|{|
  \textcolor{blue}{goto success;}
\verb|}| else if (__klee_meta3 == \textcolor{violet}{43}) \verb|{|
  \textcolor{violet}{goto success;}
\verb|}|\end{alltt}\end{varwidth}
\caption{Meta branches with the fixes in \texttt{grep}'s upstream
         and alternative patches in figure~\ref{grep-8f08d8e2}.}
\label{grep-8f08d8e2-meta}
\end{figure}

One may notice that the meta program may have any level of granularity.
\psychic{} utilizes the meta program to maximize instructions
and path constraints that are shared, which effectively means to defer
the divergence point deep into the control flow.  For its intended use
on automatically generated patches whose hunks are commonly short,
optimizing the meta program construction is not a major issue.

In algorithm~\ref{overall}, \textsc{IsMeta} cheaply detects
higher-order branching patch identifier, via basic tree pattern matching%
~\cite{tree-pattern} and skip adding its condition to the children states.
Instead, those states carry the revision number outside
of their path constraints for constant-time access when pruning paths
going through multiple patches in algorithm~\ref{apply-meta}.

\begin{algorithm}
  \caption{Construction of possible states after a patch location}
  \label{apply-meta}
  \begin{algorithmic}[1]
    \Procedure{ApplyMetaBranch}{$\varphi, i_c, i_a, e, \varPhi, n$}
      \State $n' \gets \Call{RevisionNumber}{\varphi}$
      \State $s_a = (i_a, e, \varPhi, n)$
        \Comment{assume the lack of meta branch nesting
                 and always follow alternative paths}
      \If{$n = 0$}
        \Return $\{(i_c, e, \varPhi, n'), s_a\}$
        \Comment{$i_c$ enters patch $n'$ }
      \EndIf
      \If{$n' \ne 0 \land n \ne n'$}
        \Return $\{s_a\}$
        \Comment{at most one simultaneous patch}
      \EndIf
      \State \Return $\{(i_c, e, \varPhi, n), s_a\}$
    \EndProcedure
  \end{algorithmic}
\end{algorithm}

\subsection{Symbolic output selection}
\psychic{} is based on \klee{} and thus inherits its support
for the diverse set of programs written in languages with C \gls{ffi}
for calling \klee{} intrinsics and a compiler using LLVM as a backend.
One kind of LLVM instruction may be used for difference purposes,
so for better applicability we try to accommodate for a range of usages.
For capturing output values, \psychic{} offers the following methods.
\begin{enumerate}[noitemsep]
  \item Source code annotation\label{annote}
  \item Output files (including standard output)\label{stdout}
  \item Function return values\label{ret}
  \item Pointers to mutable data as function arguments\label{parg}
\end{enumerate}

Method \ref{annote} is handled similar to previous works
described in section~\ref{klee-use}.  With \klee{} support for symbolic files,
the same avatar idiom is trivially applied for~\ref{stdout}.  However,
since files are byte buffers, their application in symbolic execution
is limited due to the scalability of \gls{smt} solvers, which treat each byte
as a variable.  For human-readable files, this is particularly inefficient
due to their low information entropy~\cite{entropy}.  On the other hand,
because of imperfect fault localization either by human~\cite{dbgbench}
or automatically~\cite{fl-eval}, output annotation has limited exhaustiveness.

To balance between manual annotation and computational scalability,
methods \ref{ret} and \ref{parg} are introduced to automatically capture output
more precisely at the end of every interested function.  Debugging information
forwarded from compiler frontend are kept in bitcodes.  They are used
to limit the subroutines to be monitored, for example in algorithm~\ref{fnout},
the file that is patched by an \gls{apr} tool.

\begin{algorithm}
  \caption{Extraction of \glspl{symout} of a function}\label{fnout}
  \begin{algorithmic}[1]
    \Procedure{FunctionOutputs}{$i$}
      \State $f \gets \Call{Function}{i}$
      \If{$f$ is not in the patched file}
        \Return $\top$
      \EndIf
      % Can't do nested calls LOL
      \State $\varPhi \gets \Call{SymbolicOutput}{\textsc{ReturnValue}(i), f}$
      \For{$a \in \Call{FunctionArguments}{i}$}
        \If{$\Call{IsPointer}{a} \land \lnot\Call{IsConstPointer}{a}
                                 \land \lnot\Call{IsFunctionPointer}{a}$}
          \State $\varPhi \gets \varPhi
            \land \Call{SymbolicOutput}{\textsc{Dereference}(a), a}$
        \EndIf
      \EndFor
      \State \Return $\varPhi$
    \EndProcedure
  \end{algorithmic}
\end{algorithm}

Each \gls{symout} then is given a unique name for later comparison.
For the sake of simplicity, we name them after the capturing method
and associated identifier, i.e.~variable or subroutine.

\subsection{Input generation}
When two states monitor some commonly named \glspl{symout}
and are at \emph{parallel} positions in their path, their path constraints
can be used to generate a \gls{difftest}.  Two states are said to be parallel
if and only if their program counter points to the same instruction
and the conjunction of their path conditions is satisfiable.

Currently \psychic{} only compare halt states to minimize memory usage
and \gls{smt} solving cost.  Such \gls{smt} formula is constructed
in algorithm~\ref{diffgen} by first append a suffix to each output name,
then for the common ones assert for a possible distinction between two paths.
If the path constraints of both states are \gls{sat} with at least
one difference in outputs, a \gls{difftest} can be generated
from the \gls{smt} model.

Halt states are reached naturally at the program's termination or an error.
The latter category also includes sanitizer traps for \gls{ub}.

\begin{algorithm}
  \caption{\Gls{difftest} generation}\label{diffgen}
  \begin{algorithmic}[1]
    \Procedure{DiffTest}{$D, \varPhi, n, \varPhi', n'$}
      \If{$n \ne n' \lor \{n, n'\} \in D$}
        \Return $\bot$
      \EndIf
      \State $\varPsi, N \gets \Call{RenameOutputs}{\varPhi, n}$
      \State $\varPsi', N' \gets \Call{RenameOutputs}{\varPhi', n'}$
      \State $\varPsi_\text{diff}
        \gets \bigvee_{a \in N \cap N'}\Call{Distinct}{a, n, n'}$
      \If{$\lnot\Call{sat}{\varPsi \land \varPsi' \land \varPsi_\text{diff}}$}
        \Return $\bot$
      \EndIf
      \State $m \gets \Call{Model}{\varPsi \land \varPsi'
                                   \land \varPsi_\text{diff}}$
      \State $I \gets \Call{InputValues}{m}$
      \State \Return $(m,
        \{\Call{Execute}{I, r} \mid r \in \textsc{revisions}\})$
    \EndProcedure
    \Procedure{RenameOutputs}{$x, n$}
      \State $X, k \gets x$ \Comment{subexpressions and kind}
      \If{$\Call{IsRead}{t}$}
        \State $a, i \gets X$ \Comment{array name and index}
        \If{$\Call{IsOutput}{a}$}
          \State $a' \gets \Call{AppendName}{a, n}$
          \State $x' \gets ((a', i), k)$
          \State \Return $x', \{a\}$
        \Else
          \State \Return $x, \varnothing$
        \EndIf
      \EndIf
      \State $X' \gets X$
      \State $N \gets \varnothing$ \Comment{seen output names}
      \For{$i \in 1{.}\,{.}\Call{Length}{X}$}
        \State $X'_i, N_+ \gets \Call{RenameOutputs}{E_i, n}$
        \State $N \gets N \cup N_+$
      \EndFor
      \State \Return $(X', k), N$
    \EndProcedure
  \end{algorithmic}
\end{algorithm}

\subsection{Decision tree construction}
From each \gls{difftest}, a set of clusters of revisions with the same output
is constructed.  We then superimpose these clustering sets to create
hierarchical clusterings or a decision tree.  A minimal tree, i.e.~a tree
with the shortest height, is found by searching through the subsets
of the set of clustering sets.