SlideShare une entreprise Scribd logo
1  sur  34
Better performance
through Superscalarity
Mårten Rånge
How many GigaFlops?
i5 6600K 3.5 GHz
(4x cores)
~224 GigaFlops
64 Flops/cycle
$Z_{n+1} = Z_n^2 + C \quad (1)$
$Z_0 = C \quad (2)$
(x,y)
$(x,y) + (c,d) = (x+c,\, y+d)$
$(x,y)^2 = (x^2 - y^2,\, 2xy)$
r
aZk
Z0
2
2a
r2
$Z_1 = Z_0^2 + C$
C
|R| = 2
Zl
Zm
Z0
$Z_{n+1} = Z_n^2 + C$
// Upper bound on the number of iterations tried for each point.
constexpr auto max_iter = 50U;

// Escape-time test for the point c = (cx, cy): iterates z -> z*z + c starting
// from z = c. Returns the number of iterations still left when |z|^2 first
// exceeds 4, or 0 when the orbit stayed bounded for all max_iter iterations.
auto mandelbrot (double cx, double cy) {
  auto re = cx;
  auto im = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto re_sq = re*re;
    const auto im_sq = im*im;
    // |z|^2 > 4 means the orbit has left the circle of radius 2.
    if (re_sq + im_sq > 4) {
      break;
    }
    // (re, im)^2 + c; im is updated first because it needs the old re.
    im = 2*re*im + cy;
    re = re_sq - im_sq + cx;
    --remaining;
  }
  return remaining;
}
$r^2 = x^2 + y^2$
y
x
r
$(x,y)^2 = (x^2 - y^2,\, 2xy)$
$Z_{n+1} = Z_n^2 + C$
SIMD
a = b+c
(a0,a1)=(b0,b1)+(c0,c1)
0 1 2 3
4 5 6 7
4 6 8 10
+
AVX
8 flops/instruction
// Upper bound on the number of iterations tried for each point.
constexpr auto max_iter = 50U;

// Scalar escape-time loop for c = (cx, cy), iterating z -> z*z + c from z = c.
// Yields the iterations remaining at the moment |z|^2 goes above 4; yields 0
// if that never happens within max_iter iterations.
auto mandelbrot (double cx, double cy) {
  auto re = cx;
  auto im = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto re_sq = re*re;
    const auto im_sq = im*im;
    // Escaped once the squared magnitude is above 4 (radius 2).
    if (re_sq + im_sq > 4) {
      break;
    }
    // Imaginary part first: it still needs the previous real part.
    im = 2*re*im + cy;
    re = re_sq - im_sq + cx;
    --remaining;
  }
  return remaining;
}
// SIMD variant of the escape-time loop: runs the z -> z*z + c recurrence on a
// whole __m256 of points per call instead of a single (cx, cy) pair.
// NOTE(review): the `*`, `+`, `-` and `<=` operators on __m256 and the
// `float8` helper are not defined in this snippet; presumably `float8 (v)`
// broadcasts v to every lane and `<=` yields an integer lane mask — confirm.
// Returns 0 as soon as the comparison mask is zero, otherwise the mask
// produced by the final loop iteration.
auto mandelbrot (__m256 cx, __m256 cy) {
auto x = cx;
auto y = cy;
int cmp_mask = 0 ;
for (auto iter = max_iter; iter > 0; --iter) {
auto x2 = x*x;
auto y2 = y*y;
auto r2 = x2 + y2;
auto _4 = float8 (4.0F);
// Non-zero while the r2 <= 4 test still holds (presumably one bit per lane).
cmp_mask = r2 <= _4;
// Mask is zero: stop early and report 0.
if (!cmp_mask) return 0;
auto xy = x*y;
// xy + xy takes the place of 2*x*y from the scalar version.
y = xy + xy + cy;
x = x2 - y2 + cx;
}
return cmp_mask;
}
Minimize CPU stalls
opcode Latency Throughput
vmulps 5 1
vaddps 3 1
vsubps 3 1
vcmpps 3 1
vmovmskps 1 1
Task<float>
// SIMD escape-time loop over a __m256 of points (cx, cy).
// NOTE(review): depends on __m256 operator overloads and a `float8` helper
// that this snippet does not show; presumably `float8 (v)` fills every lane
// with v and `<=` produces an integer lane mask — confirm.
// Returns 0 once the comparison mask becomes zero, otherwise the mask from
// the final loop iteration.
auto mandelbrot (__m256 cx, __m256 cy) {
auto x = cx;
auto y = cy;
int cmp_mask = 0 ;
for (auto iter = max_iter; iter > 0; --iter) {
// x2, y2 and xy each depend only on x and y from the previous iteration.
auto x2 = x*x;
auto y2 = y*y;
// r2 needs both squares, and the comparison below needs r2.
auto r2 = x2 + y2;
auto _4 = float8 (4.0F);
cmp_mask = r2 <= _4;
// Zero mask: leave the loop early.
if (!cmp_mask) return 0;
auto xy = x*y;
// xy + xy is used in place of 2*x*y.
y = xy + xy + cy;
x = x2 - y2 + cx;
}
return cmp_mask;
}
// Fragment of the loop body working on two register sets (index 0 and 1).
// The index-0 statements never read index-1 values and vice versa.
x2[0] = x[0]*x[0];
y2[0] = y[0]*y[0];
r2[0] = x2[0] + y2[0];
x2[1] = x[1]*x[1];
y2[1] = y[1]*y[1];
r2[1] = x2[1] + y2[1];
auto _4 = float8 (4.0);
// Merge both comparison results into one int: set 0 stays in the low bits,
// set 1 is shifted up by 8.
cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
// Fragment of the loop body working on two register sets (index 0 and 1).
// The index-0 statements never read index-1 values and vice versa.
x2[0] = x[0]*x[0];
y2[0] = y[0]*y[0];
r2[0] = x2[0] + y2[0];
x2[1] = x[1]*x[1];
y2[1] = y[1]*y[1];
r2[1] = x2[1] + y2[1];
// NOTE(review): repeats the r2[0] assignment from a few lines up with the
// same operands; looks like a slide artifact — confirm before reusing.
r2[0] = x2[0] + y2[0];
auto _4 = float8 (4.0);
// Merge both comparison results into one int: set 0 stays in the low bits,
// set 1 is shifted up by 8.
cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
x2[0] = x[0]*x[0]
y2[0] = y[0]*y[0]
r2[0] = x2[0]+y2[0]
x2[1] = x[1]*x[1]
y2[1] = y[1]*y[1]
r2[1] = x2[1]+y2[1]
Instruction queue
FU
x2[0]
y2[0]
r2[0]
x2[1]
y2[1]
r2[1]
Result queue
Shouldn’t compilers
do this for us?
// Upper bound on the number of iterations tried for each point.
constexpr auto max_iter = 50U;

// Iterates z -> z*z + c for c = (cx, cy), starting at z = c, and reports how
// many iterations were left when |z|^2 first exceeded 4. Returns 0 when the
// orbit never escaped within max_iter iterations.
auto mandelbrot (double cx, double cy) {
  auto re = cx;
  auto im = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto re_sq = re*re;
    const auto im_sq = im*im;
    // Squared magnitude above 4: the point has escaped.
    if (re_sq + im_sq > 4) {
      break;
    }
    // Update im before re, since im still depends on the old re.
    im = 2*re*im + cy;
    re = re_sq - im_sq + cx;
    --remaining;
  }
  return remaining;
}
// SIMD escape-time loop over a __m256 of points (cx, cy).
// NOTE(review): depends on __m256 operator overloads and a `float8` helper
// that this snippet does not show; presumably `float8 (v)` fills every lane
// with v and `<=` produces an integer lane mask — confirm.
// Unlike the scalar version, the result is a mask rather than an iteration
// count: 0 once the mask becomes zero, otherwise the mask from the final
// loop iteration.
auto mandelbrot (__m256 cx, __m256 cy) {
auto x = cx;
auto y = cy;
int cmp_mask = 0 ;
for (auto iter = max_iter; iter > 0; --iter) {
auto x2 = x*x;
auto y2 = y*y;
auto r2 = x2 + y2;
auto _4 = float8 (4.0F);
// The test is written as r2 <= 4 (still inside) rather than r2 > 4 (escaped).
cmp_mask = r2 <= _4;
// Zero mask: leave the loop early.
if (!cmp_mask) return 0;
auto xy = x*y;
// xy + xy is used in place of 2*x*y.
y = xy + xy + cy;
x = x2 - y2 + cx;
}
return cmp_mask;
}
Uses the mathematical properties of the Mandelbrot set
Relies on the fact that comparing inf or NaN with `<= 4` is false
AVX512
&
Hyper-threading
// Upper bound on the number of iterations tried for each point.
constexpr auto max_iter = 50U;

// Escape-time test for c = (cx, cy) using z -> z*z + c with z starting at c.
// The result is the count of iterations remaining when |z|^2 first exceeded
// 4, or 0 if all max_iter iterations completed without escaping.
auto mandelbrot (double cx, double cy) {
  auto re = cx;
  auto im = cy;
  auto remaining = max_iter;
  while (remaining > 0) {
    const auto re_sq = re*re;
    const auto im_sq = im*im;
    // Outside the radius-2 circle: stop iterating.
    if (re_sq + im_sq > 4) {
      break;
    }
    // im must be computed before re is overwritten.
    im = 2*re*im + cy;
    re = re_sq - im_sq + cx;
    --remaining;
  }
  return remaining;
}
Questions?

Contenu connexe

Tendances (17)

JavaScript - Agora nervoso
JavaScript - Agora nervosoJavaScript - Agora nervoso
JavaScript - Agora nervoso
 
Vcs23
Vcs23Vcs23
Vcs23
 
ECMAScript 6 major changes
ECMAScript 6 major changesECMAScript 6 major changes
ECMAScript 6 major changes
 
ARM 7 LPC 2148 lecture
ARM 7 LPC 2148 lectureARM 7 LPC 2148 lecture
ARM 7 LPC 2148 lecture
 
Wap in c to draw a line using DDA algorithm
Wap in c to draw a line using DDA algorithmWap in c to draw a line using DDA algorithm
Wap in c to draw a line using DDA algorithm
 
El
ElEl
El
 
Gaztea Tech Robotica 2016
Gaztea Tech Robotica 2016Gaztea Tech Robotica 2016
Gaztea Tech Robotica 2016
 
Computer graphics programs in c++
Computer graphics programs in c++Computer graphics programs in c++
Computer graphics programs in c++
 
10CSL67 CG LAB PROGRAM 10
10CSL67 CG LAB PROGRAM 1010CSL67 CG LAB PROGRAM 10
10CSL67 CG LAB PROGRAM 10
 
Senior design project code for PPG
Senior design project code for PPGSenior design project code for PPG
Senior design project code for PPG
 
Ssaw08 0624
Ssaw08 0624Ssaw08 0624
Ssaw08 0624
 
Numerical Method Assignment
Numerical Method AssignmentNumerical Method Assignment
Numerical Method Assignment
 
OOXX
OOXXOOXX
OOXX
 
Vcs9
Vcs9Vcs9
Vcs9
 
Snake.c
Snake.cSnake.c
Snake.c
 
When RV Meets CEP (RV 2016 Tutorial)
When RV Meets CEP (RV 2016 Tutorial)When RV Meets CEP (RV 2016 Tutorial)
When RV Meets CEP (RV 2016 Tutorial)
 
Oprerator overloading
Oprerator overloadingOprerator overloading
Oprerator overloading
 

Similaire à Better performance through Superscalarity

Write Python for Speed
Write Python for SpeedWrite Python for Speed
Write Python for SpeedYung-Yu Chen
 
Cocos2d Performance Tips
Cocos2d Performance TipsCocos2d Performance Tips
Cocos2d Performance TipsKeisuke Hata
 
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docxCOMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docxTashiBhutia12
 
PBL1-v1-002j.pptx
PBL1-v1-002j.pptxPBL1-v1-002j.pptx
PBL1-v1-002j.pptxNAIST
 
Coscup2021 - useful abstractions at rust and it's practical usage
Coscup2021 - useful abstractions at rust and it's practical usageCoscup2021 - useful abstractions at rust and it's practical usage
Coscup2021 - useful abstractions at rust and it's practical usageWayne Tsai
 
include ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdfinclude ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdfcontact32
 
06 Recursion in C.pptx
06 Recursion in C.pptx06 Recursion in C.pptx
06 Recursion in C.pptxMouDhara1
 
Ejerciciosderivadasresueltos
EjerciciosderivadasresueltosEjerciciosderivadasresueltos
Ejerciciosderivadasresueltosbellidomates
 
C c++-meetup-1nov2017-autofdo
C c++-meetup-1nov2017-autofdoC c++-meetup-1nov2017-autofdo
C c++-meetup-1nov2017-autofdoKim Phillips
 
Computer graphics lab manual
Computer graphics lab manualComputer graphics lab manual
Computer graphics lab manualUma mohan
 
All VLSI programs
All VLSI programsAll VLSI programs
All VLSI programsGouthaman V
 
Writing MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScriptWriting MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScriptRoland Bouman
 

Similaire à Better performance through Superscalarity (20)

Write Python for Speed
Write Python for SpeedWrite Python for Speed
Write Python for Speed
 
Boosting Developer Productivity with Clang
Boosting Developer Productivity with ClangBoosting Developer Productivity with Clang
Boosting Developer Productivity with Clang
 
Vcs16
Vcs16Vcs16
Vcs16
 
Cocos2d Performance Tips
Cocos2d Performance TipsCocos2d Performance Tips
Cocos2d Performance Tips
 
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docxCOMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
COMPAPPABCA49085rFunrAP__Practical Number 9 & 10.docx
 
SCIPY-SYMPY.pdf
SCIPY-SYMPY.pdfSCIPY-SYMPY.pdf
SCIPY-SYMPY.pdf
 
PBL1-v1-002j.pptx
PBL1-v1-002j.pptxPBL1-v1-002j.pptx
PBL1-v1-002j.pptx
 
Coscup2021 - useful abstractions at rust and it's practical usage
Coscup2021 - useful abstractions at rust and it's practical usageCoscup2021 - useful abstractions at rust and it's practical usage
Coscup2021 - useful abstractions at rust and it's practical usage
 
include ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdfinclude ltiostreamgt include ltstringgt include .pdf
include ltiostreamgt include ltstringgt include .pdf
 
PRACTICAL COMPUTING
PRACTICAL COMPUTINGPRACTICAL COMPUTING
PRACTICAL COMPUTING
 
Ocr code
Ocr codeOcr code
Ocr code
 
C# Assignmet Help
C# Assignmet HelpC# Assignmet Help
C# Assignmet Help
 
06 Recursion in C.pptx
06 Recursion in C.pptx06 Recursion in C.pptx
06 Recursion in C.pptx
 
Ejerciciosderivadasresueltos
EjerciciosderivadasresueltosEjerciciosderivadasresueltos
Ejerciciosderivadasresueltos
 
C c++-meetup-1nov2017-autofdo
C c++-meetup-1nov2017-autofdoC c++-meetup-1nov2017-autofdo
C c++-meetup-1nov2017-autofdo
 
Computer graphics lab manual
Computer graphics lab manualComputer graphics lab manual
Computer graphics lab manual
 
All VLSI programs
All VLSI programsAll VLSI programs
All VLSI programs
 
Guia edo todas
Guia edo todasGuia edo todas
Guia edo todas
 
Integral table
Integral tableIntegral table
Integral table
 
Writing MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScriptWriting MySQL User-defined Functions in JavaScript
Writing MySQL User-defined Functions in JavaScript
 

Plus de Mårten Rånge

Know your FOSS obligations
Know your FOSS obligationsKnow your FOSS obligations
Know your FOSS obligationsMårten Rånge
 
Ray Marching Explained
Ray Marching ExplainedRay Marching Explained
Ray Marching ExplainedMårten Rånge
 
Monad - a functional design pattern
Monad - a functional design patternMonad - a functional design pattern
Monad - a functional design patternMårten Rånge
 
Pragmatic metaprogramming
Pragmatic metaprogrammingPragmatic metaprogramming
Pragmatic metaprogrammingMårten Rånge
 
Concurrency - responsiveness in .NET
Concurrency - responsiveness in .NETConcurrency - responsiveness in .NET
Concurrency - responsiveness in .NETMårten Rånge
 
Concurrency scalability
Concurrency scalabilityConcurrency scalability
Concurrency scalabilityMårten Rånge
 

Plus de Mårten Rånge (10)

Know your FOSS obligations
Know your FOSS obligationsKnow your FOSS obligations
Know your FOSS obligations
 
Ray Marching Explained
Ray Marching ExplainedRay Marching Explained
Ray Marching Explained
 
Property Based Tesing
Property Based TesingProperty Based Tesing
Property Based Tesing
 
Monad - a functional design pattern
Monad - a functional design patternMonad - a functional design pattern
Monad - a functional design pattern
 
Formlets
FormletsFormlets
Formlets
 
Pragmatic metaprogramming
Pragmatic metaprogrammingPragmatic metaprogramming
Pragmatic metaprogramming
 
Concurrency - responsiveness in .NET
Concurrency - responsiveness in .NETConcurrency - responsiveness in .NET
Concurrency - responsiveness in .NET
 
Meta Programming
Meta ProgrammingMeta Programming
Meta Programming
 
Concurrency scalability
Concurrency scalabilityConcurrency scalability
Concurrency scalability
 
Concurrency
ConcurrencyConcurrency
Concurrency
 

Dernier

What is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdfWhat is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdfMounikaPolabathina
 
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptxMerck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptxLoriGlavin3
 
The Future Roadmap for the Composable Data Stack - Wes McKinney - Data Counci...
The Future Roadmap for the Composable Data Stack - Wes McKinney - Data Counci...The Future Roadmap for the Composable Data Stack - Wes McKinney - Data Counci...
The Future Roadmap for the Composable Data Stack - Wes McKinney - Data Counci...Wes McKinney
 
TeamStation AI System Report LATAM IT Salaries 2024
TeamStation AI System Report LATAM IT Salaries 2024TeamStation AI System Report LATAM IT Salaries 2024
TeamStation AI System Report LATAM IT Salaries 2024Lonnie McRorey
 
From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .Alan Dix
 
Digital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptxDigital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptxLoriGlavin3
 
Unleashing Real-time Insights with ClickHouse_ Navigating the Landscape in 20...
Unleashing Real-time Insights with ClickHouse_ Navigating the Landscape in 20...Unleashing Real-time Insights with ClickHouse_ Navigating the Landscape in 20...
Unleashing Real-time Insights with ClickHouse_ Navigating the Landscape in 20...Alkin Tezuysal
 
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxThe Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxLoriGlavin3
 
TrustArc Webinar - How to Build Consumer Trust Through Data Privacy
TrustArc Webinar - How to Build Consumer Trust Through Data PrivacyTrustArc Webinar - How to Build Consumer Trust Through Data Privacy
TrustArc Webinar - How to Build Consumer Trust Through Data PrivacyTrustArc
 
Potential of AI (Generative AI) in Business: Learnings and Insights
Potential of AI (Generative AI) in Business: Learnings and InsightsPotential of AI (Generative AI) in Business: Learnings and Insights
Potential of AI (Generative AI) in Business: Learnings and InsightsRavi Sanghani
 
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024BookNet Canada
 
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyes
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyesHow to Effectively Monitor SD-WAN and SASE Environments with ThousandEyes
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyesThousandEyes
 
Testing tools and AI - ideas what to try with some tool examples
Testing tools and AI - ideas what to try with some tool examplesTesting tools and AI - ideas what to try with some tool examples
Testing tools and AI - ideas what to try with some tool examplesKari Kakkonen
 
DevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsDevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsSergiu Bodiu
 
Decarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a realityDecarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a realityIES VE
 
The Fit for Passkeys for Employee and Consumer Sign-ins: FIDO Paris Seminar.pptx
The Fit for Passkeys for Employee and Consumer Sign-ins: FIDO Paris Seminar.pptxThe Fit for Passkeys for Employee and Consumer Sign-ins: FIDO Paris Seminar.pptx
The Fit for Passkeys for Employee and Consumer Sign-ins: FIDO Paris Seminar.pptxLoriGlavin3
 
2024 April Patch Tuesday
2024 April Patch Tuesday2024 April Patch Tuesday
2024 April Patch TuesdayIvanti
 
A Framework for Development in the AI Age
A Framework for Development in the AI AgeA Framework for Development in the AI Age
A Framework for Development in the AI AgeCprime
 
Connecting the Dots for Information Discovery.pdf
Connecting the Dots for Information Discovery.pdfConnecting the Dots for Information Discovery.pdf
Connecting the Dots for Information Discovery.pdfNeo4j
 
Genislab builds better products and faster go-to-market with Lean project man...
Genislab builds better products and faster go-to-market with Lean project man...Genislab builds better products and faster go-to-market with Lean project man...
Genislab builds better products and faster go-to-market with Lean project man...Farhan Tariq
 

Dernier (20)

What is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdfWhat is DBT - The Ultimate Data Build Tool.pdf
What is DBT - The Ultimate Data Build Tool.pdf
 
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptxMerck Moving Beyond Passwords: FIDO Paris Seminar.pptx
Merck Moving Beyond Passwords: FIDO Paris Seminar.pptx
 
The Future Roadmap for the Composable Data Stack - Wes McKinney - Data Counci...
The Future Roadmap for the Composable Data Stack - Wes McKinney - Data Counci...The Future Roadmap for the Composable Data Stack - Wes McKinney - Data Counci...
The Future Roadmap for the Composable Data Stack - Wes McKinney - Data Counci...
 
TeamStation AI System Report LATAM IT Salaries 2024
TeamStation AI System Report LATAM IT Salaries 2024TeamStation AI System Report LATAM IT Salaries 2024
TeamStation AI System Report LATAM IT Salaries 2024
 
From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .From Family Reminiscence to Scholarly Archive .
From Family Reminiscence to Scholarly Archive .
 
Digital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptxDigital Identity is Under Attack: FIDO Paris Seminar.pptx
Digital Identity is Under Attack: FIDO Paris Seminar.pptx
 
Unleashing Real-time Insights with ClickHouse_ Navigating the Landscape in 20...
Unleashing Real-time Insights with ClickHouse_ Navigating the Landscape in 20...Unleashing Real-time Insights with ClickHouse_ Navigating the Landscape in 20...
Unleashing Real-time Insights with ClickHouse_ Navigating the Landscape in 20...
 
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptxThe Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
The Role of FIDO in a Cyber Secure Netherlands: FIDO Paris Seminar.pptx
 
TrustArc Webinar - How to Build Consumer Trust Through Data Privacy
TrustArc Webinar - How to Build Consumer Trust Through Data PrivacyTrustArc Webinar - How to Build Consumer Trust Through Data Privacy
TrustArc Webinar - How to Build Consumer Trust Through Data Privacy
 
Potential of AI (Generative AI) in Business: Learnings and Insights
Potential of AI (Generative AI) in Business: Learnings and InsightsPotential of AI (Generative AI) in Business: Learnings and Insights
Potential of AI (Generative AI) in Business: Learnings and Insights
 
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
New from BookNet Canada for 2024: Loan Stars - Tech Forum 2024
 
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyes
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyesHow to Effectively Monitor SD-WAN and SASE Environments with ThousandEyes
How to Effectively Monitor SD-WAN and SASE Environments with ThousandEyes
 
Testing tools and AI - ideas what to try with some tool examples
Testing tools and AI - ideas what to try with some tool examplesTesting tools and AI - ideas what to try with some tool examples
Testing tools and AI - ideas what to try with some tool examples
 
DevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platformsDevEX - reference for building teams, processes, and platforms
DevEX - reference for building teams, processes, and platforms
 
Decarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a realityDecarbonising Buildings: Making a net-zero built environment a reality
Decarbonising Buildings: Making a net-zero built environment a reality
 
The Fit for Passkeys for Employee and Consumer Sign-ins: FIDO Paris Seminar.pptx
The Fit for Passkeys for Employee and Consumer Sign-ins: FIDO Paris Seminar.pptxThe Fit for Passkeys for Employee and Consumer Sign-ins: FIDO Paris Seminar.pptx
The Fit for Passkeys for Employee and Consumer Sign-ins: FIDO Paris Seminar.pptx
 
2024 April Patch Tuesday
2024 April Patch Tuesday2024 April Patch Tuesday
2024 April Patch Tuesday
 
A Framework for Development in the AI Age
A Framework for Development in the AI AgeA Framework for Development in the AI Age
A Framework for Development in the AI Age
 
Connecting the Dots for Information Discovery.pdf
Connecting the Dots for Information Discovery.pdfConnecting the Dots for Information Discovery.pdf
Connecting the Dots for Information Discovery.pdf
 
Genislab builds better products and faster go-to-market with Lean project man...
Genislab builds better products and faster go-to-market with Lean project man...Genislab builds better products and faster go-to-market with Lean project man...
Genislab builds better products and faster go-to-market with Lean project man...
 

Better performance through Superscalarity

  • 2. How many GigaFlops? i5 6600K 3.5 GHz (4x cores)
  • 5.
  • 6. $Z_{n+1} = Z_n^2 + C$ (1), $Z_0 = C$ (2)
  • 12. r aZk Z0 2 2a r2 Z1 = Z0 2 + C C |R| = 2 Zl Zm Z0 Zn+1 = Zn 2 + C
  • 13.
  • 14. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; } r2 = x2 + y2 y x r (x,y)2 = (x2 - y2,2xy) Zn+1 = Zn 2 + C
  • 15. SIMD
  • 18. 0 1 2 3 4 5 6 7 4 6 8 10 +
  • 20. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }
  • 21. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; }
  • 23. opcode Latency Throughput vmulps 5 1 vaddps 3 1 vsubps 3 1 vcmpps 3 1 vmovmskps 1 1
  • 25. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; }
  • 26. x2[0] = x[0]*x[0]; y2[0] = y[0]*y[0]; r2[0] = x2[0] + y2[0]; x2[1] = x[1]*x[1]; y2[1] = y[1]*y[1]; r2[1] = x2[1] + y2[1]; auto _4 = float8 (4.0); cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
  • 27. x2[0] = x[0]*x[0]; y2[0] = y[0]*y[0]; r2[0] = x2[0] + y2[0]; x2[1] = x[1]*x[1]; y2[1] = y[1]*y[1]; r2[1] = x2[1] + y2[1]; r2[0] = x2[0] + y2[0]; auto _4 = float8 (4.0); cmp_mask = r2[0] <= _4 | ((r2[1] <= _4) << 8);
  • 28. x2[0] = x[0]*x[0] y2[0] = y[0]*y[0] r2[0] = x2[0]+y2[0] x2[1] = x[1]*x[1] y2[1] = y[1]*y[1] r2[1] = x2[1]+y2[1] Instructionqueue FU x2[0] y2[0] r2[0] x2[1] y2[1] r2[1] Resultqueue
  • 30. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }
  • 31. auto mandelbrot (__m256 cx, __m256 cy) { auto x = cx; auto y = cy; int cmp_mask = 0 ; for (auto iter = max_iter; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; auto r2 = x2 + y2; auto _4 = float8 (4.0F); cmp_mask = r2 <= _4; if (!cmp_mask) return 0; auto xy = x*y; y = xy + xy + cy; x = x2 - y2 + cx; } return cmp_mask; } Uses the mathematical properties of mandelbrot Uses knowledge that inf and NaN <= 4 is false
  • 33. constexpr auto max_iter = 50U; auto mandelbrot (double cx, double cy) { auto x = cx ; auto y = cy ; auto iter = max_iter; for (; iter > 0; --iter) { auto x2 = x*x; auto y2 = y*y; if (x2 + y2 > 4) return iter; y = 2*x*y + cy ; x = x2 - y2 + cx ; } return iter; }