New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
/*
|
|
|
|
* sclog: the DynamoRIO instrumentation system that goes with the
|
|
|
|
* PuTTY test binary 'testsc'.
|
|
|
|
*
|
|
|
|
* For general discussion and build instructions, see the comment at
|
|
|
|
* the top of testsc.c.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <inttypes.h>
|
|
|
|
#include <string.h>
|
|
|
|
|
|
|
|
#include "dr_api.h"
|
|
|
|
#include "drmgr.h"
|
|
|
|
#include "drsyms.h"
|
|
|
|
#include "drreg.h"
|
|
|
|
#include "drutil.h"
|
|
|
|
#include "drwrap.h"
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The file we're currently logging to, if any.
|
|
|
|
*/
|
|
|
|
static file_t outfile = INVALID_FILE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* A counter which we can increment and decrement around any library
|
|
|
|
* function we don't want to log the details of what happens inside.
|
|
|
|
* Mainly this is for memory allocation functions, which will diverge
|
|
|
|
* control depending on the progress of their search for something
|
|
|
|
* they can allocate.
|
|
|
|
*/
|
|
|
|
size_t logging_paused = 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This log message appears at the start of whatever DynamoRIO
|
|
|
|
* considers a 'basic block', i.e. a sequence of instructions with no
|
|
|
|
* branches. Logging these is cheaper than logging every single
|
|
|
|
* instruction, and should still be adequate to detect any divergence
|
|
|
|
* of control flow.
|
|
|
|
*/
|
|
|
|
static void log_pc(const char *loc)
|
|
|
|
{
|
|
|
|
if (outfile == INVALID_FILE || logging_paused)
|
|
|
|
return;
|
|
|
|
dr_fprintf(outfile, "%s: start basic block\n", loc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Hardware division instructions are unlikely to run in time
|
|
|
|
* independent of the data, so we log both their parameters.
|
|
|
|
*/
|
|
|
|
static void log_div(uint n, uint d, const char *loc)
|
|
|
|
{
|
|
|
|
if (outfile == INVALID_FILE || logging_paused)
|
|
|
|
return;
|
|
|
|
dr_fprintf(outfile, "%s: divide %"PRIuMAX" / %"PRIuMAX"\n",
|
|
|
|
loc, (uintmax_t)n, (uintmax_t)d);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Register-controlled shift instructions are not reliably one cycle
|
|
|
|
* long on all platforms, so we log the shift couhnt.
|
|
|
|
*/
|
|
|
|
static void log_var_shift(uint sh, const char *loc)
|
|
|
|
{
|
|
|
|
if (outfile == INVALID_FILE || logging_paused)
|
|
|
|
return;
|
|
|
|
dr_fprintf(outfile, "%s: var shift by %"PRIuMAX"\n", loc, (uintmax_t)sh);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We need to log memory accesses, so as to detect data-dependent
|
|
|
|
* changes in the access pattern (e.g. incautious use of a lookup
|
|
|
|
* table). But one thing we _can't_ control for perfectly is that in
|
|
|
|
* two successive runs of the same crypto primitive, malloc may be
|
|
|
|
* called, and may return different addresses - which of course is not
|
|
|
|
* dependent on the data (unless the size of the allocated block
|
|
|
|
* does).
|
|
|
|
*
|
|
|
|
* So we track all the memory allocations that happen during logging,
|
|
|
|
* and any addresses accessed within those blocks are logged as
|
|
|
|
* something along the lines of 'n bytes from the start of the mth
|
|
|
|
* allocation'.
|
|
|
|
*
|
|
|
|
* Allocations that happened before a given log file was opened are
|
|
|
|
* not tracked. The program under test will ensure that any of those
|
|
|
|
* used by the primitive are at the same address in all runs anyway.
|
|
|
|
*/
|
|
|
|
struct allocation {
|
|
|
|
/*
|
|
|
|
* We store the list of allocations in a linked list, so we can
|
|
|
|
* look them up by address, and delete them as they're freed.
|
|
|
|
*
|
|
|
|
* A balanced binary search tree would be faster, but this is
|
|
|
|
* easier to get right first time!
|
|
|
|
*/
|
|
|
|
struct allocation *prev, *next;
|
|
|
|
uintptr_t start, size, index;
|
|
|
|
};
|
|
|
|
static struct allocation alloc_ends[1] = { alloc_ends, alloc_ends, 0, 0, 0 };
|
|
|
|
static uintptr_t next_alloc_index = 0;
|
|
|
|
|
|
|
|
static void free_allocation(struct allocation *alloc)
|
|
|
|
{
|
|
|
|
alloc->next->prev = alloc->prev;
|
|
|
|
alloc->prev->next = alloc->next;
|
|
|
|
dr_global_free(alloc, sizeof(struct allocation));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wrap the log_set_file() function in testsc.c, and respond to it by
|
|
|
|
* opening or closing log files.
|
|
|
|
*/
|
|
|
|
static void wrap_logsetfile(void *wrapctx, void **user_data)
|
|
|
|
{
|
|
|
|
if (outfile) {
|
|
|
|
dr_close_file(outfile);
|
|
|
|
outfile = INVALID_FILE;
|
|
|
|
}
|
|
|
|
|
|
|
|
const char *outfilename = drwrap_get_arg(wrapctx, 0);
|
|
|
|
if (outfilename) {
|
|
|
|
outfile = dr_open_file(outfilename, DR_FILE_WRITE_OVERWRITE);
|
|
|
|
DR_ASSERT(outfile != INVALID_FILE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reset the allocation list to empty, whenever we open or close a
|
|
|
|
* log file.
|
|
|
|
*/
|
|
|
|
while (alloc_ends->next != alloc_ends)
|
|
|
|
free_allocation(alloc_ends->next);
|
|
|
|
next_alloc_index = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wrap the dry_run() function in testsc.c, to tell it we're here.
|
|
|
|
*/
|
|
|
|
static void wrap_dryrun(void *wrapctx, void *user_data)
|
|
|
|
{
|
|
|
|
drwrap_set_retval(wrapctx, (void *)0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Look up the memory allocation record corresponding to an address.
|
|
|
|
*/
|
|
|
|
static struct allocation *find_allocation(const void *ptr)
|
|
|
|
{
|
|
|
|
uintptr_t address = (uintptr_t)ptr;
|
|
|
|
for (struct allocation *alloc = alloc_ends->next;
|
|
|
|
alloc != alloc_ends; alloc = alloc->next) {
|
|
|
|
if (alloc && address - alloc->start < alloc->size)
|
|
|
|
return alloc;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Log a memory access.
|
|
|
|
*/
|
|
|
|
static void log_mem(app_pc addr, uint size, uint write, const char *loc)
|
|
|
|
{
|
|
|
|
if (outfile == INVALID_FILE || logging_paused)
|
|
|
|
return;
|
|
|
|
|
|
|
|
struct allocation *alloc = find_allocation((const void *)addr);
|
|
|
|
if (!alloc) {
|
|
|
|
dr_fprintf(outfile, "%s: %s %"PRIuMAX" @ %"PRIxMAX"\n",
|
|
|
|
loc, write ? "store" : "load", (uintmax_t)size,
|
|
|
|
(uintmax_t)addr);
|
|
|
|
} else {
|
|
|
|
dr_fprintf(outfile, "%s: %s %"PRIuMAX" @ allocations[%"PRIuPTR"]"
|
|
|
|
" + %"PRIxMAX"\n",
|
|
|
|
loc, write ? "store" : "load", (uintmax_t)size,
|
|
|
|
alloc->index, (uintmax_t)(addr - alloc->start));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Record the allocation of some memory. (Common code between malloc
|
|
|
|
* and realloc.)
|
|
|
|
*/
|
|
|
|
static void allocated(void *ptr, size_t size)
|
|
|
|
{
|
|
|
|
if (outfile == INVALID_FILE)
|
|
|
|
return; /* no need to track allocations outside a logging interval */
|
|
|
|
|
|
|
|
struct allocation *alloc = dr_global_alloc(sizeof(struct allocation));
|
|
|
|
alloc->start = (uintptr_t)ptr;
|
|
|
|
alloc->size = size;
|
|
|
|
alloc->index = next_alloc_index++;
|
|
|
|
alloc->prev = alloc_ends->prev;
|
|
|
|
alloc->next = alloc_ends;
|
|
|
|
alloc->prev->next = alloc->next->prev = alloc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Record that memory has been freed. Note that we may free something
|
|
|
|
* that was allocated when we weren't logging, so we must cope with
|
|
|
|
* find_allocation returning NULL.
|
|
|
|
*/
|
|
|
|
static void freed(void *ptr)
|
|
|
|
{
|
|
|
|
struct allocation *alloc = find_allocation(ptr);
|
|
|
|
if (alloc)
|
|
|
|
free_allocation(alloc);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The actual wrapper functions for malloc, realloc and free.
|
|
|
|
*/
|
|
|
|
static void wrap_malloc_pre(void *wrapctx, void **user_data)
|
|
|
|
{
|
|
|
|
logging_paused++;
|
|
|
|
*user_data = drwrap_get_arg(wrapctx, 0);
|
2020-12-13 12:31:42 +00:00
|
|
|
dr_fprintf(outfile, "malloc %"PRIuMAX"\n", (uintmax_t)*user_data);
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
}
|
Side-channel tester: align memory allocations.
While trying to get an upcoming piece of code through testsc, I had
trouble - _yet again_ - with the way that control flow diverges inside
the glibc implementations of functions like memcpy and memset,
depending on the alignment of the input blocks _above_ the alignment
guaranteed by malloc, so that doing the same sequence of malloc +
memset can lead to different control flow. (I believe this is done
either for cache performance reasons or SIMD alignment requirements,
or both: on x86, some SIMD instructions require memory alignment
beyond what malloc guarantees, which is also awkward for our x86
hardware crypto implementations.)
My previous effort to normalise this problem out of sclog's log files
worked by wrapping memset and all its synonyms that I could find. But
this weekend, that failed for me, and the reason appears to be ifuncs.
I'm aware of the great irony of committing code to a security project
with a log message saying something vague about ifuncs, on the same
weekend that it came to light that commits matching that description
were one of the methods used to smuggle a backdoor into the XZ Utils
project (CVE-2024-3094). So I'll bend over backwards to explain both
what I think is going on, and why this _isn't_ a weird ifunc-related
backdooring attempt:
When I say I 'wrap' memset, I mean I use DynamoRIO's 'drwrap' API to
arrange that the side-channel test rig calls a function of mine before
and after each call to memset. The way drwrap works is to look up the
symbol address in either the main program or a shared library; in this
case, it's a shared library, namely libc.so. Then it intercepts call
instructions with exactly that address as the target.
Unfortunately, what _actually_ happens when the main program calls
memset is more complicated. First, control goes to the PLT entry for
memset (still in the main program). In principle, that loads a GOT
entry containing the address of memset (filled in by ld.so), and jumps
to it. But in fact the GOT entry varies its value through the program;
on the first call, it points to a resolver function, whose job is to
_find out_ the address of memset. And in the version of libc.so I'm
currently running, that resolver is an STT_GNU_IFUNC indirection
function, which tests the host CPU's capabilities, and chooses an
actual implementation of memset depending on what it finds. (In my
case, it looks as if it's picking one that makes extensive use of x86
SIMD.) To avoid the overhead of doing this on every call, the returned
function pointer is then written into the main program's GOT entry for
memset, overwriting the address of the resolver function, so that the
_next_ call the main program makes through the same PLT entry will go
directly to the memset variant that was chosen.
And the problem is that, after this has happened, none of the new
control flow ever goes near the _official_ address of memset, as read
out of libc.so's dynamic symbol table by DynamoRIO. The PLT entry
isn't at that address, and neither is the particular SIMD variant that
the resolver ended up choosing. So now my wrapper on memset is never
being invoked, and memset cheerfully generates different control flow
in runs of my crypto code that testsc expects to be doing exactly the
same thing as each other, and all my tests fail spuriously.
My solution, at least for the moment, is to completely abandon the
strategy of wrapping memset. Instead, let's just make it behave the
same way every time, by forcing all the affected memory allocations to
have extra-strict alignment. I found that 64-byte alignment is not
good enough to eliminate memset-related test failures, but 128-byte
alignment is.
This would be tricky in itself, if it weren't for the fact that PuTTY
already has its own wrapper function on malloc (for various reasons),
which everything in our code already uses. So I can divert to C11's
aligned_alloc() there. That in turn is done by adding a new #ifdef to
utils/memory.c, and compiling it with that #ifdef into a new object
library that is included in testsc, superseding the standard memory.o
that would otherwise be pulled in from our 'utils' static library.
With the previous memset-compensator removed, this means testsc is now
dependent on having aligned_alloc() available. So we test for it at
cmake time, and don't build testsc at all if it can't be found. This
shouldn't bother anyone very much; aligned_alloc() is available on
_my_ testsc platform, and if anyone else is trying to run this test
suite at all, I expect it will be on something at least as new as
that.
(One awkward thing here is that we can only replace _new_ allocations
with calls to aligned_alloc(): C11 provides no aligned version of
realloc. Happily, this doesn't currently introduce any new problems in
testsc. If it does, I might have to do something even more painful in
future.)
So, why isn't this an ifunc-related backdoor attempt? Because (and you
can check all of this from the patch):
1. The memset-wrapping code exists entirely within the DynamoRIO
plugin module that lives in test/sclog. That is not used in
production, only for running the 'testsc' side-channel tester.
2. The memset-wrapping code is _removed_ by this patch, not added.
3. None of this code is dealing directly with ifuncs - only working
around the unwanted effects on my test suite from the fact that
they exist somewhere else and introduce awkward behaviour.
2024-04-01 07:48:36 +00:00
|
|
|
static void wrap_aligned_alloc_pre(void *wrapctx, void **user_data)
|
|
|
|
{
|
|
|
|
logging_paused++;
|
|
|
|
size_t align = (size_t) drwrap_get_arg(wrapctx, 0);
|
|
|
|
*user_data = drwrap_get_arg(wrapctx, 1);
|
|
|
|
dr_fprintf(outfile, "aligned_alloc align=%zu size=%"PRIuMAX"\n",
|
|
|
|
align, (uintmax_t)*user_data);
|
|
|
|
}
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
static void wrap_free_pre(void *wrapctx, void **user_data)
|
|
|
|
{
|
|
|
|
logging_paused++;
|
|
|
|
void *ptr = drwrap_get_arg(wrapctx, 0);
|
|
|
|
freed(ptr);
|
|
|
|
}
|
|
|
|
static void wrap_realloc_pre(void *wrapctx, void **user_data)
|
|
|
|
{
|
|
|
|
logging_paused++;
|
|
|
|
void *ptr = drwrap_get_arg(wrapctx, 0);
|
|
|
|
freed(ptr);
|
|
|
|
*user_data = drwrap_get_arg(wrapctx, 1);
|
2020-12-13 12:31:42 +00:00
|
|
|
dr_fprintf(outfile, "realloc %"PRIuMAX"\n", (uintmax_t)*user_data);
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
}
|
|
|
|
static void wrap_alloc_post(void *wrapctx, void *user_data)
|
|
|
|
{
|
|
|
|
void *ptr = drwrap_get_retval(wrapctx);
|
|
|
|
if (!ptr)
|
|
|
|
return;
|
|
|
|
size_t size = (size_t)user_data;
|
|
|
|
allocated(ptr, size);
|
|
|
|
logging_paused--;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
Side-channel tester: align memory allocations.
While trying to get an upcoming piece of code through testsc, I had
trouble - _yet again_ - with the way that control flow diverges inside
the glibc implementations of functions like memcpy and memset,
depending on the alignment of the input blocks _above_ the alignment
guaranteed by malloc, so that doing the same sequence of malloc +
memset can lead to different control flow. (I believe this is done
either for cache performance reasons or SIMD alignment requirements,
or both: on x86, some SIMD instructions require memory alignment
beyond what malloc guarantees, which is also awkward for our x86
hardware crypto implementations.)
My previous effort to normalise this problem out of sclog's log files
worked by wrapping memset and all its synonyms that I could find. But
this weekend, that failed for me, and the reason appears to be ifuncs.
I'm aware of the great irony of committing code to a security project
with a log message saying something vague about ifuncs, on the same
weekend that it came to light that commits matching that description
were one of the methods used to smuggle a backdoor into the XZ Utils
project (CVE-2024-3094). So I'll bend over backwards to explain both
what I think is going on, and why this _isn't_ a weird ifunc-related
backdooring attempt:
When I say I 'wrap' memset, I mean I use DynamoRIO's 'drwrap' API to
arrange that the side-channel test rig calls a function of mine before
and after each call to memset. The way drwrap works is to look up the
symbol address in either the main program or a shared library; in this
case, it's a shared library, namely libc.so. Then it intercepts call
instructions with exactly that address as the target.
Unfortunately, what _actually_ happens when the main program calls
memset is more complicated. First, control goes to the PLT entry for
memset (still in the main program). In principle, that loads a GOT
entry containing the address of memset (filled in by ld.so), and jumps
to it. But in fact the GOT entry varies its value through the program;
on the first call, it points to a resolver function, whose job is to
_find out_ the address of memset. And in the version of libc.so I'm
currently running, that resolver is an STT_GNU_IFUNC indirection
function, which tests the host CPU's capabilities, and chooses an
actual implementation of memset depending on what it finds. (In my
case, it looks as if it's picking one that makes extensive use of x86
SIMD.) To avoid the overhead of doing this on every call, the returned
function pointer is then written into the main program's GOT entry for
memset, overwriting the address of the resolver function, so that the
_next_ call the main program makes through the same PLT entry will go
directly to the memset variant that was chosen.
And the problem is that, after this has happened, none of the new
control flow ever goes near the _official_ address of memset, as read
out of libc.so's dynamic symbol table by DynamoRIO. The PLT entry
isn't at that address, and neither is the particular SIMD variant that
the resolver ended up choosing. So now my wrapper on memset is never
being invoked, and memset cheerfully generates different control flow
in runs of my crypto code that testsc expects to be doing exactly the
same thing as each other, and all my tests fail spuriously.
My solution, at least for the moment, is to completely abandon the
strategy of wrapping memset. Instead, let's just make it behave the
same way every time, by forcing all the affected memory allocations to
have extra-strict alignment. I found that 64-byte alignment is not
good enough to eliminate memset-related test failures, but 128-byte
alignment is.
This would be tricky in itself, if it weren't for the fact that PuTTY
already has its own wrapper function on malloc (for various reasons),
which everything in our code already uses. So I can divert to C11's
aligned_alloc() there. That in turn is done by adding a new #ifdef to
utils/memory.c, and compiling it with that #ifdef into a new object
library that is included in testsc, superseding the standard memory.o
that would otherwise be pulled in from our 'utils' static library.
With the previous memset-compensator removed, this means testsc is now
dependent on having aligned_alloc() available. So we test for it at
cmake time, and don't build testsc at all if it can't be found. This
shouldn't bother anyone very much; aligned_alloc() is available on
_my_ testsc platform, and if anyone else is trying to run this test
suite at all, I expect it will be on something at least as new as
that.
(One awkward thing here is that we can only replace _new_ allocations
with calls to aligned_alloc(): C11 provides no aligned version of
realloc. Happily, this doesn't currently introduce any new problems in
testsc. If it does, I might have to do something even more painful in
future.)
So, why isn't this an ifunc-related backdoor attempt? Because (and you
can check all of this from the patch):
1. The memset-wrapping code exists entirely within the DynamoRIO
plugin module that lives in test/sclog. That is not used in
production, only for running the 'testsc' side-channel tester.
2. The memset-wrapping code is _removed_ by this patch, not added.
3. None of this code is dealing directly with ifuncs - only working
around the unwanted effects on my test suite from the fact that
they exist somewhere else and introduce awkward behaviour.
2024-04-01 07:48:36 +00:00
|
|
|
* Common post-wrapper function to unpause the logging.
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
*/
|
|
|
|
static void unpause_post(void *wrapctx, void *user_data)
|
|
|
|
{
|
|
|
|
logging_paused--;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make a string representation of the address of an instruction,
|
|
|
|
* including a function name and/or a file+line combination if
|
|
|
|
* possible. These will be logged alongside every act of interest
|
|
|
|
* where we can make one.
|
|
|
|
*/
|
|
|
|
static void instr_format_location(instr_t *instr, char **outloc)
|
|
|
|
{
|
|
|
|
app_pc addr = (app_pc)instr_get_app_pc(instr);
|
|
|
|
char location[2048], symbol[512], fileline[1024];
|
|
|
|
bool got_sym = false, got_line = false;
|
|
|
|
|
|
|
|
if (*outloc)
|
|
|
|
return;
|
|
|
|
|
|
|
|
symbol[0] = '\0';
|
|
|
|
fileline[0] = '\0';
|
|
|
|
|
|
|
|
module_data_t *data = dr_lookup_module(addr);
|
|
|
|
if (data) {
|
|
|
|
drsym_info_t sym;
|
|
|
|
char file[MAXIMUM_PATH];
|
|
|
|
|
|
|
|
sym.struct_size = sizeof(sym);
|
|
|
|
sym.name = symbol;
|
|
|
|
sym.name_size = sizeof(symbol);
|
|
|
|
sym.file = file;
|
|
|
|
sym.file_size = sizeof(file);
|
|
|
|
|
|
|
|
drsym_error_t status = drsym_lookup_address(
|
|
|
|
data->full_path, addr - data->start, &sym, DRSYM_DEFAULT_FLAGS);
|
|
|
|
|
|
|
|
got_line = (status == DRSYM_SUCCESS);
|
|
|
|
got_sym = got_line || status == DRSYM_ERROR_LINE_NOT_AVAILABLE;
|
|
|
|
|
|
|
|
if (got_line)
|
|
|
|
snprintf(fileline, sizeof(fileline), " = %s:%"PRIu64,
|
|
|
|
file, (uint64_t)sym.line);
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(location, sizeof(location),
|
|
|
|
"%"PRIx64"%s%s%s",
|
|
|
|
(uint64_t)addr, got_sym ? " = " : "", got_sym ? symbol : "",
|
|
|
|
fileline);
|
|
|
|
size_t len = strlen(location) + 1;
|
|
|
|
char *loc = dr_global_alloc(len);
|
|
|
|
memcpy(loc, location, len);
|
|
|
|
*outloc = loc;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Function that tests a single operand of an instruction to see if
|
|
|
|
* it's a memory reference, and if so, adds a call to log_mem.
|
|
|
|
*/
|
|
|
|
static void try_mem_opnd(
|
|
|
|
void *drcontext, instrlist_t *bb, instr_t *instr, char **loc,
|
|
|
|
opnd_t opnd, bool write)
|
|
|
|
{
|
|
|
|
if (!opnd_is_memory_reference(opnd))
|
|
|
|
return;
|
|
|
|
|
|
|
|
instr_format_location(instr, loc);
|
|
|
|
|
|
|
|
reg_id_t r0, r1;
|
|
|
|
drreg_status_t st;
|
|
|
|
st = drreg_reserve_register(drcontext, bb, instr, NULL, &r0);
|
|
|
|
DR_ASSERT(st == DRREG_SUCCESS);
|
|
|
|
st = drreg_reserve_register(drcontext, bb, instr, NULL, &r1);
|
|
|
|
DR_ASSERT(st == DRREG_SUCCESS);
|
|
|
|
|
|
|
|
bool ok = drutil_insert_get_mem_addr(drcontext, bb, instr, opnd, r0, r1);
|
|
|
|
DR_ASSERT(ok);
|
|
|
|
|
|
|
|
uint size = drutil_opnd_mem_size_in_bytes(opnd, instr);
|
|
|
|
|
|
|
|
dr_insert_clean_call(
|
|
|
|
drcontext, bb, instr, (void *)log_mem, false,
|
|
|
|
4, opnd_create_reg(r0), OPND_CREATE_INT32(size),
|
|
|
|
OPND_CREATE_INT32(write), OPND_CREATE_INTPTR(*loc));
|
|
|
|
|
|
|
|
st = drreg_unreserve_register(drcontext, bb, instr, r1);
|
|
|
|
DR_ASSERT(st == DRREG_SUCCESS);
|
|
|
|
st = drreg_unreserve_register(drcontext, bb, instr, r0);
|
|
|
|
DR_ASSERT(st == DRREG_SUCCESS);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The main function called to instrument each machine instruction.
|
|
|
|
*/
|
|
|
|
static dr_emit_flags_t instrument_instr(
|
|
|
|
void *drcontext, void *tag, instrlist_t *bb, instr_t *instr,
|
|
|
|
bool for_trace, bool translating, void *user_data)
|
|
|
|
{
|
|
|
|
char *loc = NULL;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If this instruction is the first in its basic block, call
|
|
|
|
* log_pc to record that we're executing this block at all.
|
|
|
|
*/
|
|
|
|
if (drmgr_is_first_instr(drcontext, instr)) {
|
|
|
|
instr_format_location(instr, &loc);
|
|
|
|
dr_insert_clean_call(
|
|
|
|
drcontext, bb, instr, (void *)log_pc, false,
|
|
|
|
1, OPND_CREATE_INTPTR(loc));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the instruction reads or writes memory, log its access.
|
|
|
|
*/
|
|
|
|
if (instr_reads_memory(instr) || instr_writes_memory(instr)) {
|
|
|
|
for (int i = 0, limit = instr_num_srcs(instr); i < limit; i++)
|
|
|
|
try_mem_opnd(drcontext, bb, instr, &loc,
|
2019-12-15 10:57:52 +00:00
|
|
|
instr_get_src(instr, i), instr_writes_memory(instr));
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
for (int i = 0, limit = instr_num_dsts(instr); i < limit; i++)
|
|
|
|
try_mem_opnd(drcontext, bb, instr, &loc,
|
2019-12-15 10:57:52 +00:00
|
|
|
instr_get_dst(instr, i), instr_writes_memory(instr));
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Now do opcode-specific checks.
|
|
|
|
*/
|
|
|
|
int opcode = instr_get_opcode(instr);
|
|
|
|
|
|
|
|
switch (opcode) {
|
2020-11-26 17:52:11 +00:00
|
|
|
#if defined(X86)
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
case OP_div:
|
|
|
|
case OP_idiv:
|
|
|
|
/*
|
|
|
|
* x86 hardware divisions. The operand order for DR's
|
|
|
|
* representation of these seem to be: 0 = denominator, 1 =
|
|
|
|
* numerator MSW, 2 = numerator LSW.
|
|
|
|
*/
|
|
|
|
instr_format_location(instr, &loc);
|
|
|
|
dr_insert_clean_call(
|
|
|
|
drcontext, bb, instr, (void *)log_div, false,
|
|
|
|
3, instr_get_src(instr, 2), instr_get_src(instr, 0),
|
|
|
|
OPND_CREATE_INTPTR(loc));
|
|
|
|
break;
|
2020-11-26 17:52:11 +00:00
|
|
|
#endif
|
2020-11-26 17:53:50 +00:00
|
|
|
#if defined(AARCH64)
|
|
|
|
case OP_sdiv:
|
|
|
|
case OP_udiv:
|
|
|
|
/*
|
|
|
|
* AArch64 hardware divisions. 0 = numerator, 1 = denominator.
|
|
|
|
*/
|
|
|
|
instr_format_location(instr, &loc);
|
|
|
|
dr_insert_clean_call(
|
|
|
|
drcontext, bb, instr, (void *)log_div, false,
|
|
|
|
3, instr_get_src(instr, 0), instr_get_src(instr, 1),
|
|
|
|
OPND_CREATE_INTPTR(loc));
|
|
|
|
break;
|
|
|
|
#endif
|
2020-11-26 17:52:11 +00:00
|
|
|
#if defined(X86)
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
case OP_shl:
|
|
|
|
case OP_shr:
|
|
|
|
case OP_sar:
|
|
|
|
case OP_shlx:
|
|
|
|
case OP_shrx:
|
|
|
|
case OP_sarx:
|
|
|
|
case OP_rol:
|
|
|
|
case OP_ror:
|
|
|
|
case OP_rcl:
|
Formatting change to braces around one case of a switch.
Sometimes, within a switch statement, you want to declare local
variables specific to the handler for one particular case. Until now
I've mostly been writing this in the form
switch (discriminant) {
case SIMPLE:
do stuff;
break;
case COMPLICATED:
{
declare variables;
do stuff;
}
break;
}
which is ugly because the two pieces of essentially similar code
appear at different indent levels, and also inconvenient because you
have less horizontal space available to write the complicated case
handler in - particuarly undesirable because _complicated_ case
handlers are the ones most likely to need all the space they can get!
After encountering a rather nicer idiom in the LLVM source code, and
after a bit of hackery this morning figuring out how to persuade
Emacs's auto-indent to do what I wanted with it, I've decided to move
to an idiom in which the open brace comes right after the case
statement, and the code within it is indented the same as it would
have been without the brace. Then the whole case handler (including
the break) lives inside those braces, and you get something that looks
more like this:
switch (discriminant) {
case SIMPLE:
do stuff;
break;
case COMPLICATED: {
declare variables;
do stuff;
break;
}
}
This commit is a big-bang change that reformats all the complicated
case handlers I could find into the new layout. This is particularly
nice in the Pageant main function, in which almost _every_ case
handler had a bundle of variables and was long and complicated. (In
fact that's what motivated me to get round to this.) Some of the
innermost parts of the terminal escape-sequence handling are also
breathing a bit easier now the horizontal pressure on them is
relieved.
(Also, in a few cases, I was able to remove the extra braces
completely, because the only variable local to the case handler was a
loop variable which our new C99 policy allows me to move into the
initialiser clause of its for statement.)
Viewed with whitespace ignored, this is not too disruptive a change.
Downstream patches that conflict with it may need to be reapplied
using --ignore-whitespace or similar.
2020-02-16 07:49:52 +00:00
|
|
|
case OP_rcr: {
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
/*
|
|
|
|
* Shift instructions. If they're register-controlled, log the
|
|
|
|
* shift count.
|
|
|
|
*/
|
Formatting change to braces around one case of a switch.
Sometimes, within a switch statement, you want to declare local
variables specific to the handler for one particular case. Until now
I've mostly been writing this in the form
switch (discriminant) {
case SIMPLE:
do stuff;
break;
case COMPLICATED:
{
declare variables;
do stuff;
}
break;
}
which is ugly because the two pieces of essentially similar code
appear at different indent levels, and also inconvenient because you
have less horizontal space available to write the complicated case
handler in - particuarly undesirable because _complicated_ case
handlers are the ones most likely to need all the space they can get!
After encountering a rather nicer idiom in the LLVM source code, and
after a bit of hackery this morning figuring out how to persuade
Emacs's auto-indent to do what I wanted with it, I've decided to move
to an idiom in which the open brace comes right after the case
statement, and the code within it is indented the same as it would
have been without the brace. Then the whole case handler (including
the break) lives inside those braces, and you get something that looks
more like this:
switch (discriminant) {
case SIMPLE:
do stuff;
break;
case COMPLICATED: {
declare variables;
do stuff;
break;
}
}
This commit is a big-bang change that reformats all the complicated
case handlers I could find into the new layout. This is particularly
nice in the Pageant main function, in which almost _every_ case
handler had a bundle of variables and was long and complicated. (In
fact that's what motivated me to get round to this.) Some of the
innermost parts of the terminal escape-sequence handling are also
breathing a bit easier now the horizontal pressure on them is
relieved.
(Also, in a few cases, I was able to remove the extra braces
completely, because the only variable local to the case handler was a
loop variable which our new C99 policy allows me to move into the
initialiser clause of its for statement.)
Viewed with whitespace ignored, this is not too disruptive a change.
Downstream patches that conflict with it may need to be reapplied
using --ignore-whitespace or similar.
2020-02-16 07:49:52 +00:00
|
|
|
opnd_t shiftcount = instr_get_src(instr, 0);
|
|
|
|
if (!opnd_is_immed(shiftcount)) {
|
2022-08-03 19:48:46 +00:00
|
|
|
reg_id_t r0;
|
|
|
|
drreg_status_t st;
|
|
|
|
st = drreg_reserve_register(drcontext, bb, instr, NULL, &r0);
|
|
|
|
DR_ASSERT(st == DRREG_SUCCESS);
|
|
|
|
opnd_t op_r0 = opnd_create_reg(r0);
|
|
|
|
instr_t *movzx = INSTR_CREATE_movzx(drcontext, op_r0, shiftcount);
|
|
|
|
instr_set_translation(movzx, instr_get_app_pc(instr));
|
|
|
|
instrlist_preinsert(bb, instr, movzx);
|
|
|
|
instr_format_location(instr, &loc);
|
|
|
|
dr_insert_clean_call(
|
|
|
|
drcontext, bb, instr, (void *)log_var_shift, false,
|
|
|
|
2, op_r0, OPND_CREATE_INTPTR(loc));
|
|
|
|
st = drreg_unreserve_register(drcontext, bb, instr, r0);
|
|
|
|
DR_ASSERT(st == DRREG_SUCCESS);
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
}
|
|
|
|
break;
|
Formatting change to braces around one case of a switch.
Sometimes, within a switch statement, you want to declare local
variables specific to the handler for one particular case. Until now
I've mostly been writing this in the form
switch (discriminant) {
case SIMPLE:
do stuff;
break;
case COMPLICATED:
{
declare variables;
do stuff;
}
break;
}
which is ugly because the two pieces of essentially similar code
appear at different indent levels, and also inconvenient because you
have less horizontal space available to write the complicated case
handler in - particuarly undesirable because _complicated_ case
handlers are the ones most likely to need all the space they can get!
After encountering a rather nicer idiom in the LLVM source code, and
after a bit of hackery this morning figuring out how to persuade
Emacs's auto-indent to do what I wanted with it, I've decided to move
to an idiom in which the open brace comes right after the case
statement, and the code within it is indented the same as it would
have been without the brace. Then the whole case handler (including
the break) lives inside those braces, and you get something that looks
more like this:
switch (discriminant) {
case SIMPLE:
do stuff;
break;
case COMPLICATED: {
declare variables;
do stuff;
break;
}
}
This commit is a big-bang change that reformats all the complicated
case handlers I could find into the new layout. This is particularly
nice in the Pageant main function, in which almost _every_ case
handler had a bundle of variables and was long and complicated. (In
fact that's what motivated me to get round to this.) Some of the
innermost parts of the terminal escape-sequence handling are also
breathing a bit easier now the horizontal pressure on them is
relieved.
(Also, in a few cases, I was able to remove the extra braces
completely, because the only variable local to the case handler was a
loop variable which our new C99 policy allows me to move into the
initialiser clause of its for statement.)
Viewed with whitespace ignored, this is not too disruptive a change.
Downstream patches that conflict with it may need to be reapplied
using --ignore-whitespace or similar.
2020-02-16 07:49:52 +00:00
|
|
|
}
|
2020-11-26 17:53:50 +00:00
|
|
|
#endif
|
|
|
|
#if defined(AARCH64)
|
|
|
|
case OP_lslv:
|
|
|
|
case OP_asrv:
|
|
|
|
case OP_lsrv:
|
|
|
|
case OP_rorv: {
|
|
|
|
/*
|
|
|
|
* AArch64 variable shift instructions.
|
|
|
|
*/
|
|
|
|
opnd_t shiftcount = instr_get_src(instr, 1);
|
|
|
|
DR_ASSERT(opnd_is_reg(shiftcount));
|
|
|
|
reg_id_t shiftreg = opnd_get_reg(shiftcount);
|
|
|
|
if (shiftreg >= DR_REG_W0 && shiftreg <= DR_REG_WSP)
|
|
|
|
shiftreg = reg_32_to_64(shiftreg);
|
|
|
|
instr_format_location(instr, &loc);
|
|
|
|
dr_insert_clean_call(
|
|
|
|
drcontext, bb, instr, (void *)log_var_shift, false,
|
|
|
|
2, opnd_create_reg(shiftreg), OPND_CREATE_INTPTR(loc));
|
|
|
|
break;
|
|
|
|
}
|
2020-11-26 17:52:11 +00:00
|
|
|
#endif
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return DR_EMIT_DEFAULT;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void exit_event(void)
|
|
|
|
{
|
|
|
|
if (outfile != INVALID_FILE) {
|
|
|
|
dr_fprintf(outfile, "exit while recording enabled\n");
|
|
|
|
dr_close_file(outfile);
|
|
|
|
outfile = INVALID_FILE;
|
|
|
|
}
|
|
|
|
drsym_exit();
|
|
|
|
drreg_exit();
|
|
|
|
drwrap_exit();
|
|
|
|
drutil_exit();
|
|
|
|
drmgr_exit();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We ask DR to expand any x86 string instructions like REP MOVSB, so
|
|
|
|
* that we can log all the individual memory accesses without getting
|
|
|
|
* confused.
|
|
|
|
*/
|
|
|
|
static dr_emit_flags_t expand_rep_movsb(
|
|
|
|
void *drcontext, void *tag, instrlist_t *bb, bool for_trace,
|
|
|
|
bool translating)
|
|
|
|
{
|
|
|
|
bool ok = drutil_expand_rep_string(drcontext, bb);
|
|
|
|
DR_ASSERT(ok);
|
|
|
|
return DR_EMIT_DEFAULT;
|
|
|
|
}
|
|
|
|
|
|
|
|
typedef void (*prewrapper_t)(void *wrapctx, void **user_data);
|
|
|
|
typedef void (*postwrapper_t)(void *wrapctx, void *user_data);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Helper function for bulk use of drwrap.
|
|
|
|
*/
|
|
|
|
static void try_wrap_fn(const module_data_t *module, const char *name,
|
|
|
|
prewrapper_t pre, postwrapper_t post, bool *done)
|
|
|
|
{
|
|
|
|
if (*done)
|
|
|
|
return;
|
|
|
|
|
|
|
|
size_t offset;
|
|
|
|
drsym_error_t status = drsym_lookup_symbol(
|
|
|
|
module->full_path, name, &offset, DRSYM_DEFAULT_FLAGS);
|
|
|
|
if (status == DRSYM_SUCCESS) {
|
|
|
|
app_pc notify_fn = module->start + offset;
|
|
|
|
bool ok = drwrap_wrap(notify_fn, pre, post);
|
|
|
|
DR_ASSERT(ok);
|
|
|
|
*done = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* When each module (e.g. shared library) is loaded, try to wrap all
|
|
|
|
* the functions we care about. For each one, we keep a static bool
|
|
|
|
* that will stop us trying again once we've found it the first time.
|
|
|
|
*/
|
|
|
|
static void load_module(
|
|
|
|
void *drcontext, const module_data_t *module, bool loaded)
|
|
|
|
{
|
2020-11-26 17:57:25 +00:00
|
|
|
bool libc = !strncmp(dr_module_preferred_name(module), "libc", 4);
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
|
|
|
|
#define TRY_WRAP(fn, pre, post) do \
|
|
|
|
{ \
|
|
|
|
static bool done_this_one = false; \
|
2022-08-03 19:48:46 +00:00
|
|
|
try_wrap_fn(module, fn, pre, post, &done_this_one); \
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
} while (0)
|
|
|
|
|
|
|
|
if (loaded) {
|
|
|
|
TRY_WRAP("log_to_file_real", wrap_logsetfile, NULL);
|
|
|
|
TRY_WRAP("dry_run_real", NULL, wrap_dryrun);
|
2020-11-26 17:57:25 +00:00
|
|
|
if (libc) {
|
|
|
|
TRY_WRAP("malloc", wrap_malloc_pre, wrap_alloc_post);
|
Side-channel tester: align memory allocations.
While trying to get an upcoming piece of code through testsc, I had
trouble - _yet again_ - with the way that control flow diverges inside
the glibc implementations of functions like memcpy and memset,
depending on the alignment of the input blocks _above_ the alignment
guaranteed by malloc, so that doing the same sequence of malloc +
memset can lead to different control flow. (I believe this is done
either for cache performance reasons or SIMD alignment requirements,
or both: on x86, some SIMD instructions require memory alignment
beyond what malloc guarantees, which is also awkward for our x86
hardware crypto implementations.)
My previous effort to normalise this problem out of sclog's log files
worked by wrapping memset and all its synonyms that I could find. But
this weekend, that failed for me, and the reason appears to be ifuncs.
I'm aware of the great irony of committing code to a security project
with a log message saying something vague about ifuncs, on the same
weekend that it came to light that commits matching that description
were one of the methods used to smuggle a backdoor into the XZ Utils
project (CVE-2024-3094). So I'll bend over backwards to explain both
what I think is going on, and why this _isn't_ a weird ifunc-related
backdooring attempt:
When I say I 'wrap' memset, I mean I use DynamoRIO's 'drwrap' API to
arrange that the side-channel test rig calls a function of mine before
and after each call to memset. The way drwrap works is to look up the
symbol address in either the main program or a shared library; in this
case, it's a shared library, namely libc.so. Then it intercepts call
instructions with exactly that address as the target.
Unfortunately, what _actually_ happens when the main program calls
memset is more complicated. First, control goes to the PLT entry for
memset (still in the main program). In principle, that loads a GOT
entry containing the address of memset (filled in by ld.so), and jumps
to it. But in fact the GOT entry varies its value through the program;
on the first call, it points to a resolver function, whose job is to
_find out_ the address of memset. And in the version of libc.so I'm
currently running, that resolver is an STT_GNU_IFUNC indirection
function, which tests the host CPU's capabilities, and chooses an
actual implementation of memset depending on what it finds. (In my
case, it looks as if it's picking one that makes extensive use of x86
SIMD.) To avoid the overhead of doing this on every call, the returned
function pointer is then written into the main program's GOT entry for
memset, overwriting the address of the resolver function, so that the
_next_ call the main program makes through the same PLT entry will go
directly to the memset variant that was chosen.
And the problem is that, after this has happened, none of the new
control flow ever goes near the _official_ address of memset, as read
out of libc.so's dynamic symbol table by DynamoRIO. The PLT entry
isn't at that address, and neither is the particular SIMD variant that
the resolver ended up choosing. So now my wrapper on memset is never
being invoked, and memset cheerfully generates different control flow
in runs of my crypto code that testsc expects to be doing exactly the
same thing as each other, and all my tests fail spuriously.
My solution, at least for the moment, is to completely abandon the
strategy of wrapping memset. Instead, let's just make it behave the
same way every time, by forcing all the affected memory allocations to
have extra-strict alignment. I found that 64-byte alignment is not
good enough to eliminate memset-related test failures, but 128-byte
alignment is.
This would be tricky in itself, if it weren't for the fact that PuTTY
already has its own wrapper function on malloc (for various reasons),
which everything in our code already uses. So I can divert to C11's
aligned_alloc() there. That in turn is done by adding a new #ifdef to
utils/memory.c, and compiling it with that #ifdef into a new object
library that is included in testsc, superseding the standard memory.o
that would otherwise be pulled in from our 'utils' static library.
With the previous memset-compensator removed, this means testsc is now
dependent on having aligned_alloc() available. So we test for it at
cmake time, and don't build testsc at all if it can't be found. This
shouldn't bother anyone very much; aligned_alloc() is available on
_my_ testsc platform, and if anyone else is trying to run this test
suite at all, I expect it will be on something at least as new as
that.
(One awkward thing here is that we can only replace _new_ allocations
with calls to aligned_alloc(): C11 provides no aligned version of
realloc. Happily, this doesn't currently introduce any new problems in
testsc. If it does, I might have to do something even more painful in
future.)
So, why isn't this an ifunc-related backdoor attempt? Because (and you
can check all of this from the patch):
1. The memset-wrapping code exists entirely within the DynamoRIO
plugin module that lives in test/sclog. That is not used in
production, only for running the 'testsc' side-channel tester.
2. The memset-wrapping code is _removed_ by this patch, not added.
3. None of this code is dealing directly with ifuncs - only working
around the unwanted effects on my test suite from the fact that
they exist somewhere else and introduce awkward behaviour.
2024-04-01 07:48:36 +00:00
|
|
|
TRY_WRAP("aligned_alloc", wrap_aligned_alloc_pre, wrap_alloc_post);
|
2020-11-26 17:57:25 +00:00
|
|
|
TRY_WRAP("realloc", wrap_realloc_pre, wrap_alloc_post);
|
|
|
|
TRY_WRAP("free", wrap_free_pre, unpause_post);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* More strangely named versions of standard C library
|
|
|
|
* functions, which I've observed in practice to be where the
|
|
|
|
* calls end up. I think these are probably selected by
|
|
|
|
* STT_IFUNC in libc.so, so that the normally named version of
|
|
|
|
* the function is never reached at all.
|
|
|
|
*
|
|
|
|
* This list is not expected to be complete. If you re-run
|
|
|
|
* this test on a different platform and find control flow
|
|
|
|
* diverging inside some libc function that looks as if it's
|
|
|
|
* another name for malloc or memset or whatever, then you may
|
|
|
|
* need to add more aliases here to stop the test failing.
|
|
|
|
*/
|
|
|
|
TRY_WRAP("__GI___libc_malloc", wrap_malloc_pre, wrap_alloc_post);
|
|
|
|
TRY_WRAP("__libc_malloc", wrap_malloc_pre, wrap_alloc_post);
|
|
|
|
TRY_WRAP("__GI___libc_realloc", wrap_realloc_pre, wrap_alloc_post);
|
|
|
|
TRY_WRAP("__GI___libc_free", wrap_free_pre, unpause_post);
|
|
|
|
TRY_WRAP("cfree", wrap_free_pre, unpause_post);
|
|
|
|
}
|
New test system to detect side channels in crypto code.
All the work I've put in in the last few months to eliminate timing
and cache side channels from PuTTY's mp_int and cipher implementations
has been on a seat-of-the-pants basis: just thinking very hard about
what kinds of language construction I think would be safe to use, and
trying not to absentmindedly leave a conditional branch or a cast to
bool somewhere vital.
Now I've got a test suite! The basic idea is that you run the same
crypto primitive multiple times, with inputs differing only in ways
that are supposed to avoid being leaked by timing or leaving evidence
in the cache; then you instrument the code so that it logs all the
control flow, memory access and a couple of other relevant things in
each of those runs, and finally, compare the logs and expect them to
be identical.
The instrumentation is done using DynamoRIO, which I found to be well
suited to this kind of work: it lets you define custom modifications
of the code in a reasonably low-effort way, and it lets you work at
both the low level of examining single instructions _and_ the higher
level of the function call ABI (so you can give things like malloc
special treatment, not to mention intercepting communications from the
program being instrumented). Build instructions are all in the comment
at the top of testsc.c.
At present, I've found this test to give a 100% pass rate using gcc
-O0 and -O3 (Ubuntu 18.10). With clang, there are a couple of
failures, which I'll fix in the next commit.
2019-02-10 13:09:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Main entry point that sets up all the facilities we need.
|
|
|
|
*/
|
|
|
|
DR_EXPORT void dr_client_main(client_id_t id, int argc, const char **argv)
|
|
|
|
{
|
|
|
|
dr_set_client_name(
|
|
|
|
"Time-sensitive activity logger for PuTTY crypto testing",
|
|
|
|
"https://www.chiark.greenend.org.uk/~sgtatham/putty/");
|
|
|
|
|
|
|
|
outfile = INVALID_FILE;
|
|
|
|
|
|
|
|
bool ok = drmgr_init();
|
|
|
|
DR_ASSERT(ok);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Run our main instrumentation pass with lower priority than
|
|
|
|
* drwrap, so that we don't start logging the inside of a function
|
|
|
|
* whose drwrap pre-wrapper would have wanted to disable logging.
|
|
|
|
*/
|
|
|
|
drmgr_priority_t pri = {sizeof(pri), "sclog", NULL, NULL,
|
|
|
|
DRMGR_PRIORITY_INSERT_DRWRAP+1};
|
|
|
|
ok = drmgr_register_bb_instrumentation_event(
|
|
|
|
NULL, instrument_instr, &pri);
|
|
|
|
DR_ASSERT(ok);
|
|
|
|
|
|
|
|
ok = drutil_init();
|
|
|
|
DR_ASSERT(ok);
|
|
|
|
|
|
|
|
ok = drwrap_init();
|
|
|
|
DR_ASSERT(ok);
|
|
|
|
|
|
|
|
drsym_error_t symstatus = drsym_init(0);
|
|
|
|
DR_ASSERT(symstatus == DRSYM_SUCCESS);
|
|
|
|
|
|
|
|
dr_register_exit_event(exit_event);
|
|
|
|
|
|
|
|
drreg_options_t ops = { sizeof(ops), 3, false };
|
|
|
|
drreg_status_t regstatus = drreg_init(&ops);
|
|
|
|
DR_ASSERT(regstatus == DRREG_SUCCESS);
|
|
|
|
|
|
|
|
drmgr_register_module_load_event(load_module);
|
|
|
|
|
|
|
|
ok = drmgr_register_bb_app2app_event(expand_rep_movsb, NULL);
|
|
|
|
DR_ASSERT(ok);
|
|
|
|
}
|