SlideShare a Scribd company logo
1 of 152
STABILIZER
[ASPLOS 2013]
Emery Berger, Charlie Curtsinger
Statistically Sound
Performance Evaluation
We all care about performance evaluation
We’ve been doing it wrong
STABILIZER
Repeated runs and error bars not enough
We’re not measuring what we thought
changing a program changes its layout
STABILIZER
Memory layout affects performance
STABILIZER eliminates the effect of layout
no way to measure effect of change in isolation
evaluation of LLVM’s optimizations with STABILIZER
Case Studies
enables sound performance evaluation
We’ve been doing it wrong
STABILIZER
We’ve been doing it wrong
changing a program changes its layout
Memory layout affects performance
STABILIZER eliminates the effect of layout
no way to measure effect of change in isolation
evaluation of LLVM’s optimizations with STABILIZER
Case Studies
enables sound performance evaluation
A
Unsound performance
evaluation
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
A
Unsound performance
evaluation
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
A
Unsound performance
evaluation
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
size_t meaning_of_life=42;
for (size_t i = 0; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
for (size_t i = 16; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
A
Unsound performance
evaluation
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
size_t meaning_of_life=42;
for (size_t i = 0; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
for (size_t i = 16; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
A
Unsound performance
evaluation
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
size_t meaning_of_life=42;
for (size_t i = 0; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
for (size_t i = 16; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
asm("isync");
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
A
Unsound performance
evaluation
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
size_t meaning_of_life=42;
for (size_t i = 0; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
for (size_t i = 16; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
asm("isync");
}
A′
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
A
Unsound performance
evaluation
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
size_t meaning_of_life=42;
for (size_t i = 0; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
for (size_t i = 16; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
asm("isync");
}
A′A
Unsound performance
evaluation
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
size_t meaning_of_life=42;
for (size_t i = 0; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
for (size_t i = 16; i < size; i += 32) {
asm("icbi 0,%0" : : "r"(p));
p += 32;
}
asm("isync");
}
0.00
0.25
0.50
0.75
1.00
85.0 87.5 90.0 92.5 95.0
Time (s)
Number of runs
Version
A
A'
A ×1, A′ ×1
Which is faster?
0.00
0.25
0.50
0.75
1.00
85.0 87.5 90.0 92.5 95.0
Time (s)
Number of runs
Version
A
A'
Is A′ faster than A?
0.00
0.25
0.50
0.75
1.00
85.0 87.5 90.0 92.5 95.0
Time (s)
Numberofruns
Version
A
A'
Is A′ faster than A?
0.00
0.25
0.50
0.75
1.00
85.0 87.5 90.0 92.5 95.0
Time (s)
Number of runs
Version
A
A'
Is A′ faster than A?
0.00
0.25
0.50
0.75
1.00
85.0 87.5 90.0 92.5 95.0
Time (s)
Numberofruns
Version
A
A'
Is A′ faster than A?
what about
variance?
2.8% faster
0
5
10
15
85.0 87.5 90.0 92.5 95.0
Time (s)
Number of runs
Version
A
A'
Which is faster?
A ×30, A′ ×30
0
5
10
15
85.0 87.5 90.0 92.5 95.0
Time (s)
Number of runs
Version
A
A'
Is A′ faster than A?
still
2.8% faster
Why is A′ faster than A?
Why is A′ faster than A?
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
Why is A′ faster than A?
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
Was it the
code change?
Why is A′ faster than A?
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
Or was it the
new layout?
Why is A′ faster than A?
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
Mytkowicz et al. (ASPLOS’09)
Layout biases measurement
Layout biases measurement
Mytkowicz et al. (ASPLOS’09)
Link Order
Environment
Variable Size
Changes function addresses
Moves the program stack
Layout biases measurement
Mytkowicz et al. (ASPLOS’09)
Link Order
Environment
Variable Size
Changes function addresses
Moves the program stack
Larger than
impact of -O3
Blame the cache
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
A
Blame the cache
A
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
conflict
map to same
cache set
A′
Blame the cache
A
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
Blame the cache
A
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
map to
same set
A′
Nothing here
no conflict
A′
Blame the cache
A
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
or branch predictor
or TLB
or prefetcher
or branch
target predictor
Blame the hash
A
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
int main(int argc, char **argv) {
topFrame = (void**)__builtin_frame_address(0);
setHandler(Trap::TrapSignal, onTrap);
setHandler(SIGALRM, onTimer);
setHandler(SIGSEGV, onFault);
for(Function* f: functions) {
f->setTrap();
}
setTimer(interval);
int r = stabilizer_main(argc, argv);
return r;
}
void setTimer(int msec) {
struct itimerval timer;
timer.it_value.tv_sec = (msec - msec % 1000) / 1000;
timer.it_value.tv_usec = 1000 * (msec % 1000);
timer.it_interval.tv_sec = 0;
timer.it_interval.tv_usec = 0;
setitimer(ITIMER_REAL, &timer, 0);
}
static void flush_icache(void* begin, size_t size) {
uintptr_t p = (uintptr_t)begin & ~15UL;
for (size_t i = 0; i < size; i += 16) {
asm("icbi 0,%0" : : "r"(p));
p += 16;
}
asm("isync");
}
DataHeapType* getDataHeap() {
static char buf[sizeof(DataHeapType)];
static DataHeapType* _theDataHeap = new (buf) DataHeapType;
return _theDataHeap;
}
A′
it’s faster it’s faster
it’s faster
it’s faster
Is A′ faster than A?
Let’s do a poll
Do we trust this?
it’s faster it’s faster
it’s faster
it’s faster
it’s faster it’s slower they’re the same
Is A′ faster than A?
it’s faster it’s slower they’re the same
But it ran faster!
What if we only talk to Bob?
it’s faster
But it ran faster!
What if we only use this layout?
it’s slower they’re the same
it’s faster
But it ran faster!
What if we only use this layout?
it’s faster
But it ran faster!
What if we only use this layout?
Upgrade libc
Changes layout
it’s faster
But it ran faster!
What if we only use this layout?
Change Username
Changes layout
Layout is Brittle
it’s faster
But it ran faster!
What if we only use this layout?
Run in a new
directory
Changes layout
Layout is Brittle
But it ran faster!
What if we only use this layout?
Layout biases measurement
Mytkowicz et al. (ASPLOS’09)
Can we eliminate the
effect of layout?
But it ran faster!
What if we only use this layout?
Layout biases measurement
Can we eliminate the
effect of layout?
YES
STABILIZER
Memory layout affects performance
STABILIZER eliminates the effect of layout
enables sound performance evaluation
evaluation of LLVM’s optimizations with STABILIZER
Case Studies
makes performance evaluation difficult
STABILIZER
Memory layout affects performance
STABILIZER eliminates the effect of layout
enables sound performance evaluation
evaluation of LLVM’s optimizations with STABILIZER
Case Studies
makes performance evaluation difficult
Layout biases measurement
STABILIZER
function addresses stack frame sizes
heap allocations
randomizes layout
STABILIZER
randomizes layout
function addresses stack frame sizes
heap allocations
repeatedly
Layout biases measurement
during
execution
STABILIZER
randomizes layout
function addresses stack frame sizes
heap allocations
repeatedly
Layout biases measurement
a completely random layout
cannot bias results
A ×30, A′ ×30
Sound Performance
Evaluation
A ×30 ×30
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
Percent of Observed Runtimes
A′
Sound Performance
Evaluation
Is A′ faster than A?
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
Percent of Observed Runtimes
Is faster than ?A
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
Percent of Observed Runtimes
A′
Is faster than ?A
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
Percent of Observed Runtimes
A′
Is faster than ?A
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
Percent of Observed Runtimes
A′
AA′
what is the probability of measuring
a speedup this large by chance?
The Statistical Approach
If =
hypothesis testing
what is the probability of measuring
a speedup this large by chance?
easy to compute for
the normal distribution
If A′ = A
easy to compute for
the normal distribution
If A′ = A
STABILIZER
randomizes layout repeatedly
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
Percent of Observed Runtimes
what is the probability of measuring
a speedup this large by chance?
if there is a
low probability
STABILIZER
randomizes layout repeatedly
this speedup is real
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
Percent of Observed Runtimes
not due to the effect
on memory layout
this speedup is real
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
Percent of Observed Runtimes
Sound Performance
Evaluation
STABILIZER
randomizes layout repeatedly
what does
re-randomization do?
not due to the effect
on memory layout
0.0
0.2
0.4
350 360 370 380 390 400
Time (s)
Probability Density
STABILIZER
randomizes layout repeatedly
one random
layout per-run
STABILIZER
randomizes layout repeatedly
0.0
0.2
0.4
350 360 370 380 390 400
Time (s)
Probability Density
many random
layouts in each run
one random
layout per-run
STABILIZER generates a new
random layout every ½ second
Total execution time is
the sum of all periods
STABILIZER
randomizes layout repeatedly
STABILIZER generates a new
random layout every ½ second
Total execution time is
the sum of all periods
The sum of a sufficient number of
independent, identically distributed random
variables is approximately normally distributed.
STABILIZER
randomizes layout repeatedly
STABILIZER generates a new
random layout every ½ second
Total execution time is
the sum of all periods
The sum of a sufficient number of
independent, identically distributed random
variables is approximately normally distributed.
STABILIZER
randomizes layout repeatedly
STABILIZER generates a new
random layout every ½ second
Total execution time is
the sum of all periods
The sum of a sufficient number of
independent, identically distributed random
variables is approximately normally distributed.
STABILIZER
randomizes layout repeatedly
Central Limit Theorem
execution times are
normally distributed
The sum of a sufficient number of
independent, identically distributed random
variables is approximately normally distributed.
STABILIZER
Memory layout affects performance
STABILIZER eliminates the effect of layout
enables sound performance evaluation
evaluation of LLVM’s optimizations with STABILIZER
Case Studies
makes performance evaluation difficult
STABILIZER
makes performance evaluation difficult
Memory layout affects performance
STABILIZER eliminates the effect of layout
enables sound performance evaluation
evaluation of LLVM’s optimizations with STABILIZER
Case Studies
Case Studies
on each benchmark
across the whole
benchmark suite
evaluation of LLVM’s optimizations with STABILIZER
first, build benchmarks with STABILIZER
Build programs with
STABILIZER
> szc main.c
> szc main.c
Build programs with
STABILIZER
> szc main.c -Rcode
Build programs with
STABILIZER
> szc main.c -Rcode -Rheap -Rstack
Build programs with
STABILIZER
now run the benchmarks
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
PercentofObservedRuntimes
Run benchmarks as usual
A ×30, A′ ×30
drop the results into R
Is faster than ?A
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
PercentofObservedRuntimes
A′
A′
0%
10%
20%
30%
40%
85.0 87.5 90.0 92.5 95.0
Time (s)
PercentofObservedRuntimes
If = A
If A′ = A
If A′ = A
what is the probability of measuring a
difference at least this large?
what is the probability of measuring a
difference at least this large?
A′
The Student’s t-test
If = A
t.test(times.A′, times.A)
what is the probability of measuring a
difference at least this large?
If A′ = A
The Student’s t-test
If p-value
A′
The Student’s t-test
If p-value ≤ 5%
If = A
95% Confidence
what is the probability of measuring a
difference at least this large?
If A′ = A
The Student’s t-test
If p-value ≤ 5%
we reject the null hypothesis
what is the probability of measuring a
difference at least this large?
≠ A
The Student’s t-test
If p-value ≤ 5%
we reject the null hypothesis
Random chance not responsible for
the measured difference
A′
≠ A
The difference is real
A′
−10%
0%
10%
20%
libquantum
milc
bzip2
sphinx3
namd
lbm
perlbench
hmmer
h264ref
cactusADM
wrf
sjeng
gobmk
gromacs
zeusmp
mcf
gcc
astar
Speedup
Significant
Yes
No
Speedup of -O2 over -O1
−10%
0%
10%
20%
libquantum
milc
bzip2
sphinx3
namd
lbm
perlbench
hmmer
h264ref
cactusADM
wrf
sjeng
gobmk
gromacs
zeusmp
mcf
gcc
astar
Speedup
Significant
Yes
No
Speedup of -O2 over -O1
−10%
0%
10%
20%
libquantum
milc
bzip2
sphinx3
namd
lbm
perlbench
hmmer
h264ref
cactusADM
wrf
sjeng
gobmk
gromacs
zeusmp
mcf
gcc
astar
Speedup
Significant
Yes
No
Speedup of -O2 over -O1
−10%
0%
10%
20%
libquantum
milc
bzip2
sphinx3
namd
lbm
perlbench
hmmer
h264ref
cactusADM
wrf
sjeng
gobmk
gromacs
zeusmp
mcf
gcc
astar
Speedup
Significant
Yes
No
Speedup of -O2 over -O1
−10%
0%
10%
20%
bzip2
gobmk
zeusmp
libquantum
wrf
astar
mcf
hmmer
milc
namd
gcc
lbm
gromacs
h264ref
cactusADM
perlbench
sphinx3
sjeng
Speedup
Significant
Yes
No
Speedup of -O3 over -O2
Speedup of -O3 over -O2
0.0%
0.5%
1.0%
1.5%
bzip2
gobmk
zeusmp
libquantum
wrf
astar
mcf
hmmer
milc
namd
gcc
lbm
gromacs
h264ref
cactusADM
perlbench
sphinx3
sjeng
Speedup
Significant
Yes
No
0.0%
0.5%
1.0%
1.5%
bzip2
gobmk
zeusmp
libquantum
wrf
astar
mcf
hmmer
milc
namd
gcc
lbm
gromacs
h264ref
cactusADM
perlbench
sphinx3
sjeng
Speedup
Significant
Yes
No
Speedup of -O3 over -O2
0.0%
0.5%
1.0%
1.5%
bzip2
gobmk
zeusmp
libquantum
wrf
astar
mcf
hmmer
milc
namd
gcc
lbm
gromacs
h264ref
cactusADM
perlbench
sphinx3
sjeng
Speedup
Significant
Yes
No
Speedup of -O3 over -O2
0.0%
0.5%
1.0%
1.5%
bzip2
gobmk
zeusmp
libquantum
wrf
astar
mcf
hmmer
milc
namd
gcc
lbm
gromacs
h264ref
cactusADM
perlbench
sphinx3
sjeng
Speedup
Significant
Yes
No
Speedup of -O3 over -O2
0.0%
0.5%
1.0%
1.5%
bzip2
gobmk
zeusmp
libquantum
wrf
astar
mcf
hmmer
milc
namd
gcc
lbm
gromacs
h264ref
cactusADM
perlbench
sphinx3
sjeng
Speedup
Significant
Yes
No
What do the results mean?
Comparing optimizations
-O2 ×30, -O3 ×30
-O2 -O3
×30 ×30lbm lbm
×30 ×30
Comparing optimizations
-O2 -O3
×30 ×30lbm lbm
×30 ×30astar astar
×30 ×30
Comparing optimizations
-O2 -O3
×30 ×30lbm lbm
×30 ×30astar astar
...
×30 ×30
Comparing optimizations
-O2 ×30, -O3 ×30
0%
1%
2%
3%
4%
0 25 50 75 100
Time (s)
PercentofObservedRuntimes
Version
−O2
−O3
Comparing optimizations
-O2-O3
0%
1%
2%
3%
4%
0 25 50 75 100
Time (s)
PercentofObservedRuntimes
Version
−O2
−O3
Is -O3 faster than -O2?
0%
1%
2%
3%
4%
0 25 50 75 100
Time (s)
PercentofObservedRuntimes
Version
−O2
−O3
Is -O3 faster than -O2?
0%
1%
2%
3%
4%
0 25 50 75 100
Time (s)
PercentofObservedRuntimes
Version
−O2
−O3
Is -O3 faster than -O2?
0%
1%
2%
3%
4%
0 25 50 75 100
Time (s)
PercentofObservedRuntimes
Version
−O2
−O3
Is faster than ?
If -O3 = -O2
0%
1%
2%
3%
4%
0 25 50 75 100
Time (s)
PercentofObservedRuntimes
Version
−O2
−O3
If = -O2-O3
1%
2%
3%
4%
PercentofObservedRuntimes
Version
−O2
−O3
If = -O2-O3
1%
2%
3%
4%
PercentofObservedRuntimes
Version
−O2
−O3
what is the probability of measuring
these differences?
If = -O2-O3
4%
imes
what is the probability of measuring
these differences?
Analysis of Variance
aov(time~opt+Error(benchmark/opt), times)
If = -O2-O3
4%
imes
what is the probability of measuring
these differences?
Analysis of Variance
If p-value
If = -O2-O3
4%
imes
what is the probability of measuring
these differences?
Analysis of Variance
If p-value ≤ 5%
If = -O2-O3
4%
imes
what is the probability of measuring
these differences?
Analysis of Variance
If p-value ≤ 5%
we reject the null hypothesis
Analysis of Variance
If p-value ≤ 5%
we reject the null hypothesis
p-value = 26.4%
-O3 vs -O2
are we 73.6% confident?
one in four experiments will show an
effect that does not exist!
Analysis of Variance
If p-value ≤ 5%
we reject the null hypothesis
p-value = 26.4%
fail to reject the null hypothesis
-O3 vs -O2
Analysis of Variance
If p-value ≤ 5%
we reject the null hypothesis
The effect of -O3 over -O2 is
indistinguishable from noise
Did STABILIZER hide the effect?
Runtime with -O2
Runtime with -O3
Runtime with -O1
Runtime with -O0
Execution Time
Did STABILIZER hide the effect?
STABILIZER
STABILIZER
STABILIZERSTABILIZER
Execution Time
Runtime with -O2
Runtime with -O3
Runtime with -O1
Runtime with -O0
speedups
Did STABILIZER hide the effect?
Runtime with -O2
Runtime with -O3
Runtime with -O1
Runtime with -O0
STABILIZER
STABILIZER
STABILIZERSTABILIZER
Execution Time
speedups
Did STABILIZER hide the effect?
SPECint SPECfp Summary
0%
10%
20%
30%
40%
perlbench
gcc
gobmk
h264ref
sjeng
astar
bzip2
libquantum
hmmer
mcf
cactusADM
zeusmp
wrf
gromacs
sphinx3
lbm
milc
namd
hmean
median
% Increase in Execution Time
STABILIZER is fast enough
STABILIZER
Memory layout affects performance
STABILIZER eliminates the effect of layout
showed that -O3 does not have a statistically
significant effect across our benchmarks
Case Studies
random layout enables sound performance evaluation
makes performance evaluation difficult
available at
stabilizer-tool.org
STABILIZER
PhD Fellowship: Charlie Curtsinger
Backup Slides
LLVM’s Optimizations
• Function Inlining
• Unroll Loops
• Global Value
Numbering
• Dead Global
Elimination
• Merge Duplicate
Global Constants
• Argument Promotion
• Global CSE
• Scalar Replacement of
Aggregates
-O2 -O3
Mechanisms
STABILIZER randomizes
stack frame size
argv
argc
locals
arguments
main
STABILIZER randomizes
stack frame size
argv
argc
locals
arguments
main
return address
frame ptr
locals
arguments
foo
STABILIZER randomizes
stack frame size
argv
argc
locals
arguments
main
return address
frame ptr
locals
arguments
foo
return address
frame ptr bar
argv
argc
locals
arguments
main
return address
frame ptr
locals
arguments
foo
STABILIZER randomizes
stack frame size
return address
frame ptr
locals
locals
arguments
main
return address
frame ptr
locals
arguments
foo
bar
STABILIZER randomizes
stack frame size
return address
frame ptr
locals
locals
arguments
main
return address
frame ptr
locals
arguments
foo
bar
STABILIZER randomizes
stack frame size
Random
Padding
foo
bar
baz
STABILIZER randomizes
function placement
foo
bar
baz
trap
trap
trap
STABILIZER randomizes
function placement
foo
bar
baz
jmp
trap
trap
foo′
STABILIZER randomizes
function placement
foo
bar
baz
jmp
trap
trap
&bar
&baz
&g
STABILIZER randomizes
function placement
foo′
foo
bar
baz
jmp
trap
trap
STABILIZER randomizes
function placement
&bar
&baz
&g
foo′
foo
bar
baz
jmp
trap
baz′
&foo
&bar
jmp
STABILIZER randomizes
function placement
&bar
&baz
&g
foo′
and re-randomizes during execution
Randomizes placement of
heap objects
stack frames
functions
foo
bar
baz
jmp
trap
baz′
&foo
&bar
jmp
STABILIZER re-randomizes
function placement
&bar
&baz
&g
foo′
foo
bar
baz
jmp
trap
jmp
baz′
&foo
&bar
&bar
&baz
&g
foo′
STABILIZER re-randomizes
function placement
foo
bar
baz
trap
baz′
&foo
&bar
&bar
&baz
&g
foo′
trap
trap
STABILIZER re-randomizes
function placement
foo
bar
baz
trap
baz′
&foo
&bar
&bar
&baz
&g
foo′
trap
trap
STABILIZER re-randomizes
function placement
foo
bar
baz
trap
baz′
&foo
&bar
&bar
&baz
&g
foo′
trap
jmp
baz″
&foo
&bar
STABILIZER re-randomizes
function placement
foo
bar
baz
trap &bar
&baz
&g
foo′
trap
jmp
baz″
&foo
&bar
STABILIZER re-randomizes
function placement
0.0
0.2
0.4
0.6
350 360 370 380 390 400
Time (s)
ProbabilityDensity
0.0
0.2
0.4
0.6
350 360 370 380 390 400
Time (s)
ProbabilityDensity
0.0
0.2
0.4
0.6
350 360 370 380 390 400
Time (s)
ProbabilityDensity
0.0
0.2
0.4
0.6
350 360 370 380 390 400
Time (s)
ProbabilityDensity
0.0
0.2
0.4
0.6
350 360 370 380 390 400
Time (s)
ProbabilityDensity
Programs with phases
Execution Time
Programs with phases
A B A B AA B A B A
Execution Time
A B A B A
Execution Time
A A A
B B
~ N(μA, σA)
Programs with phases
~ N(μB, σB)
A B A B A
Execution Time
A A A
B B
~ N(μA, σA)
Programs with phases
~ N(μB, σB)
~ A + B
A B A B A
Execution Time
Programs with phases
~ N(μA, σA) + N(μB, σB)
~ A + B
= N(μA+μB, √(σA² + σB²))

More Related Content

What's hot

Double linked list
Double linked listDouble linked list
Double linked listSayantan Sur
 
RxJS - 封裝程式的藝術
RxJS - 封裝程式的藝術RxJS - 封裝程式的藝術
RxJS - 封裝程式的藝術名辰 洪
 
โครงงาน เครื่องคิดเลข
โครงงาน เครื่องคิดเลขโครงงาน เครื่องคิดเลข
โครงงาน เครื่องคิดเลขBung Lfkglialbmk
 
Hashing enderecamento aberto bean - bean
Hashing enderecamento aberto bean - beanHashing enderecamento aberto bean - bean
Hashing enderecamento aberto bean - beanElaine Cecília Gatto
 
Travel management
Travel managementTravel management
Travel management1Parimal2
 
Data Structures Using C Practical File
Data Structures Using C Practical File Data Structures Using C Practical File
Data Structures Using C Practical File Rahul Chugh
 
Call stack, event loop and async programming
Call stack, event loop and async programmingCall stack, event loop and async programming
Call stack, event loop and async programmingMasters Academy
 
Data Structures Practical File
Data Structures Practical File Data Structures Practical File
Data Structures Practical File Harjinder Singh
 
Basic Programs of C++
Basic Programs of C++Basic Programs of C++
Basic Programs of C++Bharat Kalia
 
Assignment no39
Assignment no39Assignment no39
Assignment no39Jay Patel
 
ภาษาซี
ภาษาซีภาษาซี
ภาษาซีkramsri
 
ภาษาซี
ภาษาซีภาษาซี
ภาษาซีkramsri
 
Solutionsfor co2 C Programs for data structures
Solutionsfor co2 C Programs for data structuresSolutionsfor co2 C Programs for data structures
Solutionsfor co2 C Programs for data structuresLakshmi Sarvani Videla
 
Understanding storage class using nm
Understanding storage class using nmUnderstanding storage class using nm
Understanding storage class using nmmohamed sikander
 
Hashing endereçamento aberto - main
Hashing endereçamento aberto - mainHashing endereçamento aberto - main
Hashing endereçamento aberto - mainElaine Cecília Gatto
 

What's hot (19)

Double linked list
Double linked listDouble linked list
Double linked list
 
Arrays
ArraysArrays
Arrays
 
RxJS - 封裝程式的藝術
RxJS - 封裝程式的藝術RxJS - 封裝程式的藝術
RxJS - 封裝程式的藝術
 
โครงงาน เครื่องคิดเลข
โครงงาน เครื่องคิดเลขโครงงาน เครื่องคิดเลข
โครงงาน เครื่องคิดเลข
 
Hashing enderecamento aberto bean - bean
Hashing enderecamento aberto bean - beanHashing enderecamento aberto bean - bean
Hashing enderecamento aberto bean - bean
 
Travel management
Travel managementTravel management
Travel management
 
C questions
C questionsC questions
C questions
 
Data Structures Using C Practical File
Data Structures Using C Practical File Data Structures Using C Practical File
Data Structures Using C Practical File
 
Call stack, event loop and async programming
Call stack, event loop and async programmingCall stack, event loop and async programming
Call stack, event loop and async programming
 
Stl algorithm-Basic types
Stl algorithm-Basic typesStl algorithm-Basic types
Stl algorithm-Basic types
 
C lab programs
C lab programsC lab programs
C lab programs
 
Data Structures Practical File
Data Structures Practical File Data Structures Practical File
Data Structures Practical File
 
Basic Programs of C++
Basic Programs of C++Basic Programs of C++
Basic Programs of C++
 
Assignment no39
Assignment no39Assignment no39
Assignment no39
 
ภาษาซี
ภาษาซีภาษาซี
ภาษาซี
 
ภาษาซี
ภาษาซีภาษาซี
ภาษาซี
 
Solutionsfor co2 C Programs for data structures
Solutionsfor co2 C Programs for data structuresSolutionsfor co2 C Programs for data structures
Solutionsfor co2 C Programs for data structures
 
Understanding storage class using nm
Understanding storage class using nmUnderstanding storage class using nm
Understanding storage class using nm
 
Hashing endereçamento aberto - main
Hashing endereçamento aberto - mainHashing endereçamento aberto - main
Hashing endereçamento aberto - main
 

Similar to Stabilizer: Statistically Sound Performance Evaluation

The Ring programming language version 1.9 book - Part 94 of 210
The Ring programming language version 1.9 book - Part 94 of 210The Ring programming language version 1.9 book - Part 94 of 210
The Ring programming language version 1.9 book - Part 94 of 210Mahmoud Samir Fayed
 
COA_remaining_lab_works_077BCT033.pdf
COA_remaining_lab_works_077BCT033.pdfCOA_remaining_lab_works_077BCT033.pdf
COA_remaining_lab_works_077BCT033.pdfJavedAnsari236392
 
Solve the coding errors for upvotemake test-statsg++ -g -std=c++.pdf
Solve the coding errors for upvotemake test-statsg++ -g -std=c++.pdfSolve the coding errors for upvotemake test-statsg++ -g -std=c++.pdf
Solve the coding errors for upvotemake test-statsg++ -g -std=c++.pdfsnewfashion
 
Programming with GUTs
Programming with GUTsProgramming with GUTs
Programming with GUTsKevlin Henney
 
An imperative study of c
An imperative study of cAn imperative study of c
An imperative study of cTushar B Kute
 
Developer Experience i TypeScript. Najbardziej ikoniczne duo
Developer Experience i TypeScript. Najbardziej ikoniczne duoDeveloper Experience i TypeScript. Najbardziej ikoniczne duo
Developer Experience i TypeScript. Najbardziej ikoniczne duoThe Software House
 
I have written the code but cannot complete the assignment please help.pdf
I have written the code but cannot complete the assignment please help.pdfI have written the code but cannot complete the assignment please help.pdf
I have written the code but cannot complete the assignment please help.pdfshreeaadithyaacellso
 
The IoT Academy IoT Training Arduino Part 3 programming
The IoT Academy IoT Training Arduino Part 3 programmingThe IoT Academy IoT Training Arduino Part 3 programming
The IoT Academy IoT Training Arduino Part 3 programmingThe IOT Academy
 
Статичный SQL в С++14. Евгений Захаров ➠ CoreHard Autumn 2019
Статичный SQL в С++14. Евгений Захаров ➠  CoreHard Autumn 2019Статичный SQL в С++14. Евгений Захаров ➠  CoreHard Autumn 2019
Статичный SQL в С++14. Евгений Захаров ➠ CoreHard Autumn 2019corehard_by
 
FileName EX06_1java Programmer import ja.pdf
FileName EX06_1java Programmer  import ja.pdfFileName EX06_1java Programmer  import ja.pdf
FileName EX06_1java Programmer import ja.pdfactocomputer
 
The Ring programming language version 1.7 book - Part 87 of 196
The Ring programming language version 1.7 book - Part 87 of 196The Ring programming language version 1.7 book - Part 87 of 196
The Ring programming language version 1.7 book - Part 87 of 196Mahmoud Samir Fayed
 
The Ring programming language version 1.5.3 book - Part 91 of 184
The Ring programming language version 1.5.3 book - Part 91 of 184The Ring programming language version 1.5.3 book - Part 91 of 184
The Ring programming language version 1.5.3 book - Part 91 of 184Mahmoud Samir Fayed
 

Similar to Stabilizer: Statistically Sound Performance Evaluation (20)

Sysprog 12
Sysprog 12Sysprog 12
Sysprog 12
 
Sysprog 12
Sysprog 12Sysprog 12
Sysprog 12
 
The Ring programming language version 1.9 book - Part 94 of 210
The Ring programming language version 1.9 book - Part 94 of 210The Ring programming language version 1.9 book - Part 94 of 210
The Ring programming language version 1.9 book - Part 94 of 210
 
Sysprog 13
Sysprog 13Sysprog 13
Sysprog 13
 
Cpds lab
Cpds labCpds lab
Cpds lab
 
COA_remaining_lab_works_077BCT033.pdf
COA_remaining_lab_works_077BCT033.pdfCOA_remaining_lab_works_077BCT033.pdf
COA_remaining_lab_works_077BCT033.pdf
 
Code optimization
Code optimization Code optimization
Code optimization
 
Code optimization
Code optimization Code optimization
Code optimization
 
Solve the coding errors for upvotemake test-statsg++ -g -std=c++.pdf
Solve the coding errors for upvotemake test-statsg++ -g -std=c++.pdfSolve the coding errors for upvotemake test-statsg++ -g -std=c++.pdf
Solve the coding errors for upvotemake test-statsg++ -g -std=c++.pdf
 
Programming with GUTs
Programming with GUTsProgramming with GUTs
Programming with GUTs
 
An imperative study of c
An imperative study of cAn imperative study of c
An imperative study of c
 
Developer Experience i TypeScript. Najbardziej ikoniczne duo
Developer Experience i TypeScript. Najbardziej ikoniczne duoDeveloper Experience i TypeScript. Najbardziej ikoniczne duo
Developer Experience i TypeScript. Najbardziej ikoniczne duo
 
I have written the code but cannot complete the assignment please help.pdf
I have written the code but cannot complete the assignment please help.pdfI have written the code but cannot complete the assignment please help.pdf
I have written the code but cannot complete the assignment please help.pdf
 
The IoT Academy IoT Training Arduino Part 3 programming
The IoT Academy IoT Training Arduino Part 3 programmingThe IoT Academy IoT Training Arduino Part 3 programming
The IoT Academy IoT Training Arduino Part 3 programming
 
Статичный SQL в С++14. Евгений Захаров ➠ CoreHard Autumn 2019
Статичный SQL в С++14. Евгений Захаров ➠  CoreHard Autumn 2019Статичный SQL в С++14. Евгений Захаров ➠  CoreHard Autumn 2019
Статичный SQL в С++14. Евгений Захаров ➠ CoreHard Autumn 2019
 
C lab programs
C lab programsC lab programs
C lab programs
 
FileName EX06_1java Programmer import ja.pdf
FileName EX06_1java Programmer  import ja.pdfFileName EX06_1java Programmer  import ja.pdf
FileName EX06_1java Programmer import ja.pdf
 
The Ring programming language version 1.7 book - Part 87 of 196
The Ring programming language version 1.7 book - Part 87 of 196The Ring programming language version 1.7 book - Part 87 of 196
The Ring programming language version 1.7 book - Part 87 of 196
 
The Ring programming language version 1.5.3 book - Part 91 of 184
The Ring programming language version 1.5.3 book - Part 91 of 184The Ring programming language version 1.5.3 book - Part 91 of 184
The Ring programming language version 1.5.3 book - Part 91 of 184
 
Arduino Programming
Arduino ProgrammingArduino Programming
Arduino Programming
 

More from Emery Berger

Doppio: Breaking the Browser Language Barrier
Doppio: Breaking the Browser Language BarrierDoppio: Breaking the Browser Language Barrier
Doppio: Breaking the Browser Language BarrierEmery Berger
 
Dthreads: Efficient Deterministic Multithreading
Dthreads: Efficient Deterministic MultithreadingDthreads: Efficient Deterministic Multithreading
Dthreads: Efficient Deterministic MultithreadingEmery Berger
 
Programming with People
Programming with PeopleProgramming with People
Programming with PeopleEmery Berger
 
DieHarder (CCS 2010, WOOT 2011)
DieHarder (CCS 2010, WOOT 2011)DieHarder (CCS 2010, WOOT 2011)
DieHarder (CCS 2010, WOOT 2011)Emery Berger
 
Operating Systems - Advanced File Systems
Operating Systems - Advanced File SystemsOperating Systems - Advanced File Systems
Operating Systems - Advanced File SystemsEmery Berger
 
Operating Systems - File Systems
Operating Systems - File SystemsOperating Systems - File Systems
Operating Systems - File SystemsEmery Berger
 
Operating Systems - Networks
Operating Systems - NetworksOperating Systems - Networks
Operating Systems - NetworksEmery Berger
 
Operating Systems - Queuing Systems
Operating Systems - Queuing SystemsOperating Systems - Queuing Systems
Operating Systems - Queuing SystemsEmery Berger
 
Operating Systems - Distributed Parallel Computing
Operating Systems - Distributed Parallel ComputingOperating Systems - Distributed Parallel Computing
Operating Systems - Distributed Parallel ComputingEmery Berger
 
Operating Systems - Concurrency
Operating Systems - ConcurrencyOperating Systems - Concurrency
Operating Systems - ConcurrencyEmery Berger
 
Operating Systems - Advanced Synchronization
Operating Systems - Advanced SynchronizationOperating Systems - Advanced Synchronization
Operating Systems - Advanced SynchronizationEmery Berger
 
Operating Systems - Synchronization
Operating Systems - SynchronizationOperating Systems - Synchronization
Operating Systems - SynchronizationEmery Berger
 
Processes and Threads
Processes and ThreadsProcesses and Threads
Processes and ThreadsEmery Berger
 
Virtual Memory and Paging
Virtual Memory and PagingVirtual Memory and Paging
Virtual Memory and PagingEmery Berger
 
Operating Systems - Virtual Memory
Operating Systems - Virtual MemoryOperating Systems - Virtual Memory
Operating Systems - Virtual MemoryEmery Berger
 
MC2: High-Performance Garbage Collection for Memory-Constrained Environments
MC2: High-Performance Garbage Collection for Memory-Constrained EnvironmentsMC2: High-Performance Garbage Collection for Memory-Constrained Environments
MC2: High-Performance Garbage Collection for Memory-Constrained EnvironmentsEmery Berger
 
Vam: A Locality-Improving Dynamic Memory Allocator
Vam: A Locality-Improving Dynamic Memory AllocatorVam: A Locality-Improving Dynamic Memory Allocator
Vam: A Locality-Improving Dynamic Memory AllocatorEmery Berger
 
Quantifying the Performance of Garbage Collection vs. Explicit Memory Management
Quantifying the Performance of Garbage Collection vs. Explicit Memory ManagementQuantifying the Performance of Garbage Collection vs. Explicit Memory Management
Quantifying the Performance of Garbage Collection vs. Explicit Memory ManagementEmery Berger
 
Garbage Collection without Paging
Garbage Collection without PagingGarbage Collection without Paging
Garbage Collection without PagingEmery Berger
 
DieHard: Probabilistic Memory Safety for Unsafe Languages
DieHard: Probabilistic Memory Safety for Unsafe LanguagesDieHard: Probabilistic Memory Safety for Unsafe Languages
DieHard: Probabilistic Memory Safety for Unsafe LanguagesEmery Berger
 

More from Emery Berger (20)

Doppio: Breaking the Browser Language Barrier
Doppio: Breaking the Browser Language BarrierDoppio: Breaking the Browser Language Barrier
Doppio: Breaking the Browser Language Barrier
 
Dthreads: Efficient Deterministic Multithreading
Dthreads: Efficient Deterministic MultithreadingDthreads: Efficient Deterministic Multithreading
Dthreads: Efficient Deterministic Multithreading
 
Programming with People
Programming with PeopleProgramming with People
Programming with People
 
DieHarder (CCS 2010, WOOT 2011)
DieHarder (CCS 2010, WOOT 2011)DieHarder (CCS 2010, WOOT 2011)
DieHarder (CCS 2010, WOOT 2011)
 
Operating Systems - Advanced File Systems
Operating Systems - Advanced File SystemsOperating Systems - Advanced File Systems
Operating Systems - Advanced File Systems
 
Operating Systems - File Systems
Operating Systems - File SystemsOperating Systems - File Systems
Operating Systems - File Systems
 
Operating Systems - Networks
Operating Systems - NetworksOperating Systems - Networks
Operating Systems - Networks
 
Operating Systems - Queuing Systems
Operating Systems - Queuing SystemsOperating Systems - Queuing Systems
Operating Systems - Queuing Systems
 
Operating Systems - Distributed Parallel Computing
Operating Systems - Distributed Parallel ComputingOperating Systems - Distributed Parallel Computing
Operating Systems - Distributed Parallel Computing
 
Operating Systems - Concurrency
Operating Systems - ConcurrencyOperating Systems - Concurrency
Operating Systems - Concurrency
 
Operating Systems - Advanced Synchronization
Operating Systems - Advanced SynchronizationOperating Systems - Advanced Synchronization
Operating Systems - Advanced Synchronization
 
Operating Systems - Synchronization
Operating Systems - SynchronizationOperating Systems - Synchronization
Operating Systems - Synchronization
 
Processes and Threads
Processes and ThreadsProcesses and Threads
Processes and Threads
 
Virtual Memory and Paging
Virtual Memory and PagingVirtual Memory and Paging
Virtual Memory and Paging
 
Operating Systems - Virtual Memory
Operating Systems - Virtual MemoryOperating Systems - Virtual Memory
Operating Systems - Virtual Memory
 
MC2: High-Performance Garbage Collection for Memory-Constrained Environments
MC2: High-Performance Garbage Collection for Memory-Constrained EnvironmentsMC2: High-Performance Garbage Collection for Memory-Constrained Environments
MC2: High-Performance Garbage Collection for Memory-Constrained Environments
 
Vam: A Locality-Improving Dynamic Memory Allocator
Vam: A Locality-Improving Dynamic Memory AllocatorVam: A Locality-Improving Dynamic Memory Allocator
Vam: A Locality-Improving Dynamic Memory Allocator
 
Quantifying the Performance of Garbage Collection vs. Explicit Memory Management
Quantifying the Performance of Garbage Collection vs. Explicit Memory ManagementQuantifying the Performance of Garbage Collection vs. Explicit Memory Management
Quantifying the Performance of Garbage Collection vs. Explicit Memory Management
 
Garbage Collection without Paging
Garbage Collection without PagingGarbage Collection without Paging
Garbage Collection without Paging
 
DieHard: Probabilistic Memory Safety for Unsafe Languages
DieHard: Probabilistic Memory Safety for Unsafe LanguagesDieHard: Probabilistic Memory Safety for Unsafe Languages
DieHard: Probabilistic Memory Safety for Unsafe Languages
 

Stabilizer: Statistically Sound Performance Evaluation

  • 1. STABILIZER [ASPLOS 2013] Emery Berger, Charlie Curtsinger Statistically Sound Performance Evaluation
  • 2. We all care about performance evaluation We’ve been doing it wrong STABILIZER Repeated runs and error bars not enough We’re not measuring what we thought
  • 3. changing a program changes its layout STABILIZER Memory layout affects performance STABILIZER eliminates the effect of layout no way to measure effect of change in isolation evaluation of LLVM’s optimizations with STABILIZER Case Studies enables sound performance evaluation We’ve been doing it wrong
  • 4. STABILIZER We’ve been doing it wrong changing a program changes its layout Memory layout affects performance STABILIZER eliminates the effect of layout no way to measure effect of change in isolation evaluation of LLVM’s optimizations with STABILIZER Case Studies enables sound performance evaluation
  • 5. A Unsound performance evaluation int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; }
  • 6. int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } A Unsound performance evaluation static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; }
  • 7. int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } A Unsound performance evaluation static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; size_t meaning_of_life=42; for (size_t i = 0; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } for (size_t i = 16; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; }
  • 8. int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } A Unsound performance evaluation static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; size_t meaning_of_life=42; for (size_t i = 0; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } for (size_t i = 16; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; }
  • 9. int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } A Unsound performance evaluation DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; size_t meaning_of_life=42; for (size_t i = 0; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } for (size_t i = 16; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } asm("isync"); }
  • 10. int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } A Unsound performance evaluation DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; size_t meaning_of_life=42; for (size_t i = 0; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } for (size_t i = 16; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } asm("isync"); }
  • 11. A′ int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } A Unsound performance evaluation DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; size_t meaning_of_life=42; for (size_t i = 0; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } for (size_t i = 16; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } asm("isync"); }
  • 12. A′A Unsound performance evaluation int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; size_t meaning_of_life=42; for (size_t i = 0; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } for (size_t i = 16; i < size; i += 32) { asm("icbi 0,%0" : : "r"(p)); p += 32; } asm("isync"); }
  • 13. 0.00 0.25 0.50 0.75 1.00 85.0 87.5 90.0 92.5 95.0 Time (s) Number of runs Version A A' A′ ×1 A ×1 Which is faster?
  • 14. 0.00 0.25 0.50 0.75 1.00 85.0 87.5 90.0 92.5 95.0 Time (s) Number of runs Version A A' Is A′ faster than A?
  • 15. 0.00 0.25 0.50 0.75 1.00 85.0 87.5 90.0 92.5 95.0 Time (s) Number of runs Version A A' Is A′ faster than A?
  • 16. 0.00 0.25 0.50 0.75 1.00 85.0 87.5 90.0 92.5 95.0 Time (s) Number of runs Version A A' Is A′ faster than A?
  • 17. 0.00 0.25 0.50 0.75 1.00 85.0 87.5 90.0 92.5 95.0 Time (s) Number of runs Version A A' Is A′ faster than A? what about variance? 2.8% faster
  • 18. 0 5 10 15 85.0 87.5 90.0 92.5 95.0 Time (s) Number of runs Version A A' Which is faster? A′ ×30 A ×30
  • 19. 0 5 10 15 85.0 87.5 90.0 92.5 95.0 Time (s) Number of runs Version A A' Is A′ faster than A? still 2.8% faster
  • 20. Why is A′ faster than A?
  • 21. Why is faster than ?A′ A int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; }
  • 22. Why is faster than ?A′ A int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } Was it the code change?
  • 23. Why is faster than ?A′ A int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } Or was it the new layout?
  • 24. Why is faster than ?A′ A int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } Mytkowicz et al. (ASPLOS’09) Layout biases measurement
  • 25. Layout biases measurement Mytkowicz et al. (ASPLOS’09) Link Order Environment Variable Size Changes function addresses Moves the program stack
  • 26. Layout biases measurement Mytkowicz et al. (ASPLOS’09) Link Order Environment Variable Size Changes function addresses Moves the program stack Larger than impact of -O3
  • 27. Blame the cache int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } A
  • 28. Blame the cache A int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } conflict map to same cache set
  • 29. A′ Blame the cache A int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; }
  • 30. Blame the cache A int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } map to same set A′ Nothing here no conflict
  • 31. A′ Blame the cache A int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } or branch predictor or TLB or prefetcher or branch target predictor
  • 32. Blame the hash A int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } int main(int argc, char **argv) { topFrame = (void**)__builtin_frame_address(0); setHandler(Trap::TrapSignal, onTrap); setHandler(SIGALRM, onTimer); setHandler(SIGSEGV, onFault); for(Function* f: functions) { f->setTrap(); } setTimer(interval); int r = stabilizer_main(argc, argv); return r; } void setTimer(int msec) { struct itimerval timer; timer.it_value.tv_sec = (msec - msec % 1000) / 1000; timer.it_value.tv_usec = 1000 * (msec % 1000); timer.it_interval.tv_sec = 0; timer.it_interval.tv_usec = 0; setitimer(ITIMER_REAL, &timer, 0); } static void flush_icache(void* begin, size_t size) { uintptr_t p = (uintptr_t)begin & ~15UL; for (size_t i = 0; i < size; i += 16) { asm("icbi 0,%0" : : "r"(p)); p += 16; } asm("isync"); } DataHeapType* getDataHeap() { static char buf[sizeof(DataHeapType)]; static DataHeapType* _theDataHeap = new (buf) DataHeapType; return _theDataHeap; } A′
  • 33. it’s faster it’s faster it’s faster it’s faster Is A′ faster than A? Let’s do a poll
  • 34. Do we trust this? it’s faster it’s faster it’s faster it’s faster
  • 35. it’s faster it’s slower they’re the same Is A′ faster than A?
  • 36. it’s faster it’s slower they’re the same But it ran faster! What if we only talk to Bob?
  • 37. it’s faster But it ran faster! What if we only use this layout? it’s slower they’re the same
  • 38. it’s faster But it ran faster! What if we only use this layout?
  • 39. it’s faster But it ran faster! What if we only use this layout? Upgrade libc Changes layout
  • 40. it’s faster But it ran faster! What if we only use this layout? Change Username Changes layout
  • 41. Layout is Brittle it’s faster But it ran faster! What if we only use this layout? Run in a new directory Changes layout
  • 42. Layout is Brittle But it ran faster! What if we only use this layout? Layout biases measurement Mytkowicz et al. (ASPLOS’09) Can we eliminate the effect of layout?
  • 43. But it ran faster! What if we only use this layout? Layout biases measurement Can we eliminate the effect of layout? YES
  • 44. STABILIZER Memory layout affects performance STABILIZER eliminates the effect of layout enables sound performance evaluation evaluation of LLVM’s optimizations with STABILIZER Case Studies makes performance evaluation difficult
  • 45. STABILIZER Memory layout affects performance STABILIZER eliminates the effect of layout enables sound performance evaluation evaluation of LLVM’s optimizations with STABILIZER Case Studies makes performance evaluation difficult
  • 46. Layout biases measurement STABILIZER function addresses stack frame sizes heap allocations randomizes layout
  • 47. STABILIZER randomizes layout function addresses stack frame sizes heap allocations repeatedly Layout biases measurement during execution
  • 48. STABILIZER randomizes layout function addresses stack frame sizes heap allocations repeatedly Layout biases measurement a completely random layout cannot bias results
  • 49. A′ ×30 A ×30 Sound Performance Evaluation
  • 50. A ×30 A′ ×30 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes Sound Performance Evaluation
  • 51. Is A′ faster than A? 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes
  • 52. Is A′ faster than A? 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes
  • 53. Is A′ faster than A? 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes
  • 54. Is A′ faster than A? 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes
  • 55. If A′ = A, what is the probability of measuring a speedup this large by chance? The Statistical Approach: hypothesis testing
  • 56. If A′ = A, what is the probability of measuring a speedup this large by chance? easy to compute for the normal distribution
  • 57. If A′ = A: easy to compute for the normal distribution
  • 58. STABILIZER randomizes layout repeatedly 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes what is the probability of measuring a speedup this large by chance? if there is a low probability
  • 59. STABILIZER randomizes layout repeatedly this speedup is real 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes not due to the effect on memory layout
  • 60. this speedup is real 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes Sound Performance Evaluation STABILIZER randomizes layout repeatedly what does re-randomization do? not due to the effect on memory layout
  • 61. 0.0 0.2 0.4 350 360 370 380 390 400 Time (s) Probability Density STABILIZER randomizes layout repeatedly one random layout per run
  • 62. STABILIZER randomizes layout repeatedly 0.0 0.2 0.4 350 360 370 380 390 400 Time (s) Probability Density many random layouts in each run one random layout per run
  • 63. STABILIZER generates a new random layout every ½ second Total execution time is the sum of all periods STABILIZER randomizes layout repeatedly
  • 64. STABILIZER generates a new random layout every ½ second Total execution time is the sum of all periods The sum of a sufficient number of independent, identically distributed random variables is approximately normally distributed. STABILIZER randomizes layout repeatedly
  • 65. STABILIZER generates a new random layout every ½ second Total execution time is the sum of all periods The sum of a sufficient number of independent, identically distributed random variables is approximately normally distributed. STABILIZER randomizes layout repeatedly
  • 66. STABILIZER generates a new random layout every ½ second Total execution time is the sum of all periods The sum of a sufficient number of independent, identically distributed random variables is approximately normally distributed. STABILIZER randomizes layout repeatedly
  • 67. Central Limit Theorem execution times are normally distributed The sum of a sufficient number of independent, identically distributed random variables is approximately normally distributed.
  • 68. STABILIZER Memory layout affects performance STABILIZER eliminates the effect of layout enables sound performance evaluation evaluation of LLVM’s optimizations with STABILIZER Case Studies makes performance evaluation difficult
  • 69. STABILIZER makes performance evaluation difficult Memory layout affects performance STABILIZER eliminates the effect of layout enables sound performance evaluation evaluation of LLVM’s optimizations with STABILIZER Case Studies
  • 70. Case Studies on each benchmark across the whole benchmark suite evaluation of LLVM’s optimizations with STABILIZER first, build benchmarks with STABILIZER
  • 72. > szc main.c Build programs with STABILIZER
  • 73. > szc -Rcode main.c Build programs with STABILIZER
  • 74. > szc -Rcode -Rheap -Rstack main.c Build programs with STABILIZER now run the benchmarks
  • 75. 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes Run benchmarks as usual A′ ×30 A ×30 drop the results into R
  • 76. Is A′ faster than A? 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes
  • 77. If A′ = A 0% 10% 20% 30% 40% 85.0 87.5 90.0 92.5 95.0 Time (s) Percent of Observed Runtimes
  • 79. If A′ = A what is the probability of measuring a difference at least this large?
  • 80. If A′ = A what is the probability of measuring a difference at least this large? The Student’s t-test t.test(times.A′, times.A)
  • 81. If A′ = A what is the probability of measuring a difference at least this large? The Student’s t-test If p-value
  • 82. If A′ = A what is the probability of measuring a difference at least this large? The Student’s t-test If p-value ≤ 5% 95% Confidence
  • 83. If A′ = A what is the probability of measuring a difference at least this large? The Student’s t-test If p-value ≤ 5% we reject the null hypothesis
  • 84. A′ ≠ A The Student’s t-test If p-value ≤ 5% we reject the null hypothesis Random chance not responsible for the measured difference
  • 85. A′ ≠ A The difference is real
  • 91. Speedup of -O3 over -O2 0.0% 0.5% 1.0% 1.5% bzip2 gobmk zeusmp libquantum wrf astar mcf hmmer milc namd gcc lbm gromacs h264ref cactusADM perlbench sphinx3 sjeng Speedup Significant Yes No
  • 97. -O2 -O3 ×30 ×30 lbm lbm ×30 ×30 Comparing optimizations
  • 98. -O2 -O3 ×30 ×30 lbm lbm ×30 ×30 astar astar ×30 ×30 Comparing optimizations
  • 99. -O2 -O3 ×30 ×30 lbm lbm ×30 ×30 astar astar ... ×30 ×30 Comparing optimizations
  • 100. -O2 -O3 ×30 ×30 0% 1% 2% 3% 4% 0 25 50 75 100 Time (s) Percent of Observed Runtimes Version −O2 −O3 Comparing optimizations
  • 101. Is -O3 faster than -O2? 0% 1% 2% 3% 4% 0 25 50 75 100 Time (s) Percent of Observed Runtimes Version −O2 −O3
  • 102. Is -O3 faster than -O2? 0% 1% 2% 3% 4% 0 25 50 75 100 Time (s) Percent of Observed Runtimes Version −O2 −O3
  • 103. Is -O3 faster than -O2? 0% 1% 2% 3% 4% 0 25 50 75 100 Time (s) Percent of Observed Runtimes Version −O2 −O3
  • 104. Is -O3 faster than -O2? 0% 1% 2% 3% 4% 0 25 50 75 100 Time (s) Percent of Observed Runtimes Version −O2 −O3
  • 105. If -O3 = -O2 0% 1% 2% 3% 4% 0 25 50 75 100 Time (s) Percent of Observed Runtimes Version −O2 −O3
  • 107. If -O3 = -O2 1% 2% 3% 4% Percent of Observed Runtimes Version −O2 −O3 what is the probability of measuring these differences?
  • 108. If -O3 = -O2 4% imes what is the probability of measuring these differences? Analysis of Variance aov(time~opt+Error(benchmark/opt), times)
  • 109. If -O3 = -O2 4% imes what is the probability of measuring these differences? Analysis of Variance If p-value
  • 110. If -O3 = -O2 4% imes what is the probability of measuring these differences? Analysis of Variance If p-value ≤ 5%
  • 111. If -O3 = -O2 4% imes what is the probability of measuring these differences? Analysis of Variance If p-value ≤ 5% we reject the null hypothesis
  • 112. Analysis of Variance If p-value ≤ 5% we reject the null hypothesis p-value = 26.4% -O3 vs -O2 are we 73.6% confident? one in four experiments will show an effect that does not exist!
  • 113. Analysis of Variance If p-value ≤ 5% we reject the null hypothesis p-value = 26.4% fail to reject the null hypothesis -O3 vs -O2
  • 114. Analysis of Variance If p-value ≤ 5% we reject the null hypothesis The effect of -O3 over -O2 is indistinguishable from noise Did STABILIZER hide the effect?
  • 115. Runtime with -O2 Runtime with -O3 Runtime with -O1 Runtime with -O0 Execution Time Did STABILIZER hide the effect?
  • 116. STABILIZER STABILIZER STABILIZER STABILIZER Execution Time Runtime with -O2 Runtime with -O3 Runtime with -O1 Runtime with -O0 speedups Did STABILIZER hide the effect?
  • 117. Runtime with -O2 Runtime with -O3 Runtime with -O1 Runtime with -O0 STABILIZER STABILIZER STABILIZER STABILIZER Execution Time speedups Did STABILIZER hide the effect?
  • 119. STABILIZER Memory layout affects performance STABILIZER eliminates the effect of layout showed that -O3 does not have a statistically significant effect across our benchmarks Case Studies random layout enables sound performance evaluation makes performance evaluation difficult
  • 122. LLVM’s Optimizations • Function Inlining • Unroll Loops • Global Value Numbering • Dead Global Elimination • Merge Duplicate Global Constants • Argument Promotion • Global CSE • Scalar Replacement of Aggregates -O2 -O3
  • 124. STABILIZER randomizes stack frame size argv argc locals arguments main
  • 125. STABILIZER randomizes stack frame size argv argc locals arguments main return address frame ptr locals arguments foo
  • 126. STABILIZER randomizes stack frame size argv argc locals arguments main return address frame ptr locals arguments foo return address frame ptr bar
  • 128. return address frame ptr locals locals arguments main return address frame ptr locals arguments foo bar STABILIZER randomizes stack frame size
  • 129. return address frame ptr locals locals arguments main return address frame ptr locals arguments foo bar STABILIZER randomizes stack frame size Random Padding
  • 136. and re-randomizes during execution Randomizes placement of heap objects stack frames functions
  • 143. 0.0 0.2 0.4 0.6 350 360 370 380 390 400 Time (s) Probability Density
  • 144. 0.0 0.2 0.4 0.6 350 360 370 380 390 400 Time (s) Probability Density
  • 145. 0.0 0.2 0.4 0.6 350 360 370 380 390 400 Time (s) Probability Density
  • 146. 0.0 0.2 0.4 0.6 350 360 370 380 390 400 Time (s) Probability Density
  • 147. 0.0 0.2 0.4 0.6 350 360 370 380 390 400 Time (s) Probability Density
  • 149. Programs with phases A B A B AA B A B A Execution Time
  • 150. A B A B A Execution Time A A A ~ N(μA, σA²) B B ~ N(μB, σB²) Programs with phases
  • 151. A B A B A Execution Time A A A ~ N(μA, σA²) B B ~ N(μB, σB²) Programs with phases Execution Time ~ A + B
  • 152. A B A B A Execution Time Programs with phases Execution Time ~ A + B ~ N(μA, σA²) + N(μB, σB²) = N(μA+μB, σA²+σB²)