SlideShare une entreprise Scribd logo
1  sur  39
Télécharger pour lire hors ligne
High-Level Language Features for Low-Level Programming


Cyrus Omar
Computer Science Department
Carnegie Mellon University
http://www.cs.cmu.edu/~comar/
The Needs of Scientists and Engineers




ment group) or letting the users/stakeholders know how the                                                                                                             100

software works (open source, scientific paper publication).
            Other                                                                                                                                                               Using Tool is ’Standard’
                                                                                                                                                                           80
      Reason given for use of programming language




                                                         Only language known
3.8                                                  Non-functional requirements




                                                                                                                                                   % of respondents
                                                                                                                                                                                Tool is Easy to Use
                                                           Required                                                                                                        60
  The respondents were asked to rate a series of non-functional                                                                                                                                Open Source




                                                                                                                                      Reason for use of tools
                  Favourite
requirements on the following Likert scale:                                                                                                                                40
                                                                               Performance                                                                                                                     Cost (or lack thereof) of Tool

                                                                                                                                                                           20
  1. very unimportant                                                          Legacy                                                                                                                               Project Organisation

                                                                                       Ease of use                                                                          0
  2. unimportant                                                                                                                                                                                                                       Features




                                                                                                                                                                                 Reliability


                                                                                                                                                                                               Functionality


                                                                                                                                                                                                                  Usability


                                                                                                                                                                                                                              Availability


                                                                                                                                                                                                                                             Flexibility


                                                                                                                                                                                                                                                           Performance*


                                                                                                                                                                                                                                                                            Portability


                                                                                                                                                                                                                                                                                          Testability


                                                                                                                                                                                                                                                                                                        Maintainability


                                                                                                                                                                                                                                                                                                                              Tracability*


                                                                                                                                                                                                                                                                                                                                             Reusability
                                                                                          Developer experience
                                                                                                                                                                                                                                                                          Version Control is ’Required’
  3. neither                                                                              Features

                                                                                                Cross-platform compatability                                                                                                                                                      Improve Ease of Coding
  4. important
                                                     0     10          20         30             40           50               60                             0                         5
                                                                                                                                                                           Very Unimportant                10    Neither                                                                  15                                       20
                                                                                                                                                                                                                                                                                                                          Very Important
                                                                                                                                                                                Unimportant Number of respondents (out of 46)
                                                                                                                                                                                                              Important
   5. very important Number of respondents
  Figure 7: Reasons for Choice of Programming Lan-                                                                                  Figure 18: Importance why non-functional require-
                                                                                                                                          Figure 9: Reasons of tools are used
   This scale was chosen so that the relative importance of
  guage                                                                                                                             ments as rated by respondents
non-functional requirements could be determined from re-                                                                                                              60
spondents’ answers. A straight ranking of non-functional re-
quirements would only indicate how important respondents
          Modelling                                                                                                                                                   50
                                                                                                                                    Table 1: Combined important and very important
considered each non-functional requirement in comparison
                                                                                                                                    ratings for non-functional requirements
                                                                                                                                      Number of respondents




          Framework
to others, but would not provide any information regard-                                                                                 40
                                                                                                                                        Ranking Requirement       Combined Important
ing how importantTracking
             Bug/Change a non-functional requirement was over-
                                                                                                                                         30                       and Very Important
all. The neutral response of ‘neither’ was included as some
                                                                                                                                                                      Ratings (%)
respondents may not consider a non-functional requirement
             Build Tools
      Tool type




                                                                                                                                         20 1     Reliability             100
or are unaware of it.
              Libraries/Packages                                                                                                            2     Functionality            95
   Non-functional requirements from the Software Require-
                                                                                                                                            3     Maintainability   [Nguyen-Hoan et al, 2010]
                                                                                                                                                                           90
                                                                                                                                         10
ments Specification Data Item described in United States
                             Testing
The State of Scientific Programming Today


  C, Fortran, CUDA, OpenCL                             MATLAB, Python, R, Perl

  Fast                                                 Productive
  Control over memory allocation                       Low syntactic overhead
  Control over data movement                           Read-eval-print loop (REPL)
  Access to hardware primitives                        Flexible data structures and abstractions
  Portability                                          Nice development environments


  Tedious                                              Slow
  Type annotations, templates, pragmas                 Dynamic lookups and indirection abound
  Obtuse compilers, linkers, preprocessors             Automatic memory management can cause problems
  No support for high-level abstractions




       Scientists relieve the tension by:
       • writing overall control flow and basic data analysis routines in a high-level language
       • calling into a low-level language for performance-critical sections (can be annoying)
The State of Scientific Programming Tomorrow


  C, Fortran, CUDA, OpenCL                             MATLAB, Python, R, Perl

  Fast                                                 Productive
  Control over memory allocation                       Low syntactic overhead
  Control over data movement                           Read-eval-print loop (REPL)
  Access to hardware primitives                        Flexible data structures and abstractions
  Portability                                          Nice development environments


  Tedious                                              Slow
  Type annotations, templates, pragmas                 Dynamic lookups and indirection abound
  Obtuse compilers, linkers, preprocessors             Automatic memory management can cause problems
  No support for high-level abstractions




       Scientists relieve any remaining tension by:
       • writing overall control flow and basic data analysis routines in a high-level language
       • calling into cl.oquence for performance-critical sections (can be annoying)
What is cl.oquence?

  A low-level programming language that maps closely onto, and compiles down to, OpenCL.

What is OpenCL?

  OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is
  designed as a library that can be used from a variety of higher-level languages.

What is a heterogeneous computing environment?

  A heterogeneous computing environment is an environment where many different compute devices and address
  spaces are available. Devices can include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core
  processors like the Cell BE and other specialized accelerators.

Why should I use cl.oquence?

   •   Same core type system (including pointers) and performance profile as OpenCL
   •   Usable from any host language that has OpenCL bindings
   •   Type inference and extension inference eliminate the annotation burden
   •   Simplified syntax is a subset of Python, can use existing tools
   •   Structural polymorphism gives you generic programming by default
   •   New features:
       • Higher-order functions
       • Default arguments for functions
   •   Python as the preprocessor and module system
       • Rich support for compile-time metaprogramming
       • Write compiler extensions, new basic types as libraries; modular, clean design
   •   Light-weight and easy to integrate into any build process
   •   Packaged with special Python host bindings that eliminate even basic overhead when using from within Python
       • Built on top of pyopencl and numpy
What is cl.oquence?

  A low-level programming language that maps closely onto, and compiles down to, OpenCL.

What is OpenCL?

  OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is
  designed as a library that can be used from a variety of higher-level languages.

What is a heterogeneous computing environment?

  A heterogeneous computing environment is an environment where many different compute devices and address
  spaces are available. Devices can include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core
  processors like the Cell BE and other specialized accelerators.

Why should I use cl.oquence?

   •   Same core type system (including pointers) and performance profile as OpenCL
   •   Usable from any host language that has OpenCL bindings
   •   Type inference and extension inference eliminate the annotation burden
   •   Simplified syntax is a subset of Python, can use existing tools
   •   Structural polymorphism gives you generic programming by default
   •   New features:
       • Higher-order functions
       • Default arguments for functions
   •   Python as the preprocessor and module system
       • Rich support for compile-time metaprogramming
       • Write compiler extensions, new basic types as libraries; modular, clean design
   •   Light-weight and easy to integrate into any build process
   •   Packaged with special Python host bindings that eliminate even basic overhead when using from within Python
       • Built on top of pyopencl and numpy
What is cl.oquence?

  A low-level programming language that maps closely onto, and compiles down to, OpenCL.

What is OpenCL?

  OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is
  designed as a library that can be used from a variety of higher-level languages.

What is a heterogeneous computing environment?

  A heterogeneous computing environment is one where many different devices and address spaces must be
  managed. Examples of devices include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core
  processors like the Cell BE and other specialized accelerators.

Why should I use cl.oquence?

   •   Same core type system (including pointers) and performance profile as OpenCL
   •   Usable from any host language that has OpenCL bindings
   •   Type inference and extension inference eliminate the annotation burden
   •   Simplified syntax is a subset of Python, can use existing tools
   •   Structural polymorphism gives you generic programming by default
   •   New features:
       • Higher-order functions
       • Default arguments for functions
   •   Python as the preprocessor and module system
       • Rich support for compile-time metaprogramming
       • Write compiler extensions, new basic types as libraries; modular, clean design
   •   Light-weight and easy to integrate into any build process
   •   Packaged with special Python host bindings that eliminate even basic overhead when using from within Python
       • Built on top of pyopencl and numpy
What is cl.oquence?

  A low-level programming language that maps closely onto, and compiles down to, OpenCL.

What is OpenCL?

  OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is
  designed as a library that can be used from a variety of higher-level languages.

What is a heterogeneous computing environment?

  A heterogeneous computing environment is one where many different devices and address spaces must be
  managed. Examples of devices include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core
  processors like the Cell BE and other specialized accelerators.

Why should I use cl.oquence?

   •   Same core type system (including pointers) and performance profile as OpenCL
   •   Usable from any host language that has OpenCL bindings
   •   Type inference and extension inference eliminate the annotation burden
   •   Simplified syntax is a subset of Python, can use existing tools
   •   Structural polymorphism gives you generic programming by default
   •   New features:
       • Higher-order functions
       • Default arguments for functions
   •   Python as the preprocessor and module system
       • Rich support for compile-time metaprogramming
       • Write compiler extensions, new basic types as libraries; modular, clean design
   •   Light-weight and easy to integrate into any build process
   •   Packaged with special Python host bindings that eliminate even basic overhead when using from within Python
       • Built on top of pyopencl and numpy
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  short*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  float*	
  a,	
  __global	
  double*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  ...
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  short*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  float*	
  a,	
  __global	
  double*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  ...
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  short*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum(__global	
  float*	
  a,	
  __global	
  double*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  ...
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_di(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            My photographs tell stories of loss, human struggle, and personal exploration
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       within landscapes scarred by technology and over-use… [I] strive to metaphorically
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                  and poetically link laborious actions, idiosyncratic rituals and strangely
                                                                                                                           crude machines into tales about our modern experience.
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                                                                                      Robert ParkeHarrison

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable                                               These two libraries express the same thing.
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                  The code will run in precisely the same amount of time.
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
Two invocation models
                                               @cl.oquence.fn
 1. Standalone compilation to OpenCL           def	
  ew_op(a,	
  b,	
  dest,	
  op):
     • Use any host language that has OpenCL   	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
       bindings available                      	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
        •C                                     	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
        • C++
        • Fortran                              @cl.oquence.fn
                                               def	
  plus(a,	
  b):
        • MATLAB                               	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
        • Java                                              return	
  a	
  +	
  b
        • .NET
        • Ruby                                 @cl.oquence.fn
                                               def	
  mul(a,	
  b):
        • Python                               	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
                                                            return	
  a	
  *	
  b

                                               #	
  Programmatically	
  specialize	
  and	
  assign	
  types	
  to	
  
                                               #	
  any	
  externally	
  callable	
  versions	
  you	
  need.

                                               sum	
  =	
  ew_op.specialize(op=plus)
                                               prod	
  =	
  ew_op.specialize(op=mul)

                                               g_int_p	
  =	
  cl_int.global_ptr
                                               g_float_p	
  =	
  cl_float.global_ptr

                                               sum_ff	
  =	
  sum.compile(g_float_p,	
  g_float_p,	
  g_float_p)
                                               sum_ii	
  =	
  sum.compile(g_int_p,	
  g_int_p,	
  g_int_p)
Two invocation models
                                                                                                                       @cl.oquence.fn
 1. Standalone compilation to OpenCL                                                                                   def	
  ew_op(a,	
  b,	
  dest,	
  op):
     • Use any host language that has OpenCL                                                                           	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
       bindings available                                                                                              	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
        •C                                                                                                             	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
        • C++
        • Fortran                                                                                                      @cl.oquence.fn
                                                                                                                       def	
  plus(a,	
  b):
        • MATLAB                                                                                                       	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
        • Java                                                                                                                      return	
  a	
  +	
  b
        • .NET
        • Ruby                                                                                                         @cl.oquence.fn
                                                                                                                       def	
  mul(a,	
  b):
        • Python                                                                                                       	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
                                                                                                                                    return	
  a	
  *	
  b

                                                                                                                       #	
  Programmatically	
  specialize	
  and	
  assign	
  types	
  to	
  
clqcc	
  hello.clq                                                                                                     #	
  any	
  externally	
  callable	
  versions	
  you	
  need.

                                                                                                                       sum	
  =	
  ew_op.specialize(op=plus)
     creates hello.cl:                                                                                                 prod	
  =	
  ew_op.specialize(op=mul)

__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                      g_int_p	
  =	
  cl_int.global_ptr
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);
                                                                                                                       g_float_p	
  =	
  cl_float.global_ptr
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                      sum_ff	
  =	
  sum.compile(g_float_p,	
  g_float_p,	
  g_float_p)
                                                                                                                       sum_ii	
  =	
  sum.compile(g_int_p,	
  g_int_p,	
  g_int_p)
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}
Two invocation models
                                                  @cl.oquence.fn
 1. Standalone compilation to OpenCL              def	
  ew_op(a,	
  b,	
  dest,	
  op):
 2. Integrated into a host language               	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
     • Python + pyopencl (w/extensions) + numpy   	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
                                                  	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])

                                                  @cl.oquence.fn
                                                  def	
  plus(a,	
  b):
                                                  	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
                                                               return	
  a	
  +	
  b

                                                  @cl.oquence.fn
                                                  def	
  mul(a,	
  b):
                                                  	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
                                                               return	
  a	
  *	
  b

                                                  #	
  allocate	
  two	
  random	
  arrays	
  that	
  we	
  will	
  be	
  adding
                                                  a	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
                                                  b	
  =	
  numpy.random.rand(50000).astype(numpy.float32)

                                                  #	
  transfer	
  data	
  to	
  device
                                                  ctx	
  =	
  cl.ctx	
  =	
  cl.Context.for_device(0,	
  0)
                                                  a_buf	
  =	
  ctx.to_device(a)
                                                  b_buf	
  =	
  ctx.to_device(b)
                                                  dest_buf	
  =	
  ctx.alloc(like=a)

                                                  #	
  invoke	
  function	
  (automatically	
  specialized	
  as	
  needed)
                                                  ew_op(a_buf,	
  b_buf,	
  dest_buf,	
  plus,	
  
                                                  	
  	
  	
  	
  	
  	
  global_size=a.shape,	
  local_size=(256,)).wait()

                                                  #	
  get	
  results
                                                  result	
  =	
  ctx.from_device(dest_buf)

                                                  #	
  check	
  results
                                                  print	
  la.norm(c	
  -­‐	
  (a	
  +	
  b))
Two invocation models
                                                      @cl.oquence.fn
 1. Standalone compilation to OpenCL                  def	
  ew_op(a,	
  b,	
  dest,	
  op):
 2. Integrated into a host language                   	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
     • Python + pyopencl (w/extensions) + numpy       	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
                                                      	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])

                                                      @cl.oquence.fn
                                                      def	
  plus(a,	
  b):
                                                      	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
Four simple memory management functions                            return	
  a	
  +	
  b
 1. to_device: numpy array => new buffer              @cl.oquence.fn
 2. from_device: buffer => new numpy array            def	
  mul(a,	
  b):
 3. alloc: empty buffer                               	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
 4. copy: copies between existing buffers or arrays                return	
  a	
  *	
  b

                                                      #	
  allocate	
  two	
  random	
  arrays	
  that	
  we	
  will	
  be	
  adding
Buffers hold metadata (type, shape, order) so you     a	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
don’t have to provide it.                             b	
  =	
  numpy.random.rand(50000).astype(numpy.float32)

                                                      #	
  transfer	
  data	
  to	
  device
                                                      ctx	
  =	
  cl.ctx	
  =	
  cl.Context.for_device(0,	
  0)
                                                      a_buf	
  =	
  ctx.to_device(a)
                                                      b_buf	
  =	
  ctx.to_device(b)
                                                      dest_buf	
  =	
  ctx.alloc(like=a)

                                                      #	
  invoke	
  function	
  (automatically	
  specialized	
  as	
  needed)
                                                      ew_op(a_buf,	
  b_buf,	
  dest_buf,	
  plus,	
  
                                                      	
  	
  	
  	
  	
  	
  global_size=a.shape,	
  local_size=(256,)).wait()

                                                      #	
  get	
  results
                                                      result	
  =	
  ctx.from_device(dest_buf)

                                                      #	
  check	
  results
                                                      print	
  la.norm(c	
  -­‐	
  (a	
  +	
  b))
Two invocation models
                                                      @cl.oquence.fn
 1. Standalone compilation to OpenCL                  def	
  ew_op(a,	
  b,	
  dest,	
  op):
 2. Integrated into a host language                   	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
     • Python + pyopencl (w/extensions) + numpy       	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
                                                      	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])

                                                      @cl.oquence.fn
                                                      def	
  plus(a,	
  b):
                                                      	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
Four simple memory management functions                            return	
  a	
  +	
  b
 1. to_device: numpy array => new buffer              @cl.oquence.fn
 2. from_device: buffer => new numpy array            def	
  mul(a,	
  b):
 3. alloc: empty buffer                               	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
 4. copy: copies between existing buffers or arrays                return	
  a	
  *	
  b

                                                      #	
  allocate	
  two	
  random	
  arrays	
  that	
  we	
  will	
  be	
  adding
Buffers hold metadata (type, shape, order) so you     a	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
don’t have to provide it.                             b	
  =	
  numpy.random.rand(50000).astype(numpy.float32)

                                                      #	
  transfer	
  data	
  to	
  device
                                                      ctx	
  =	
  cl.ctx	
  =	
  cl.Context.for_device(0,	
  0)
Implicit queue associated with each context.          a_buf	
  =	
  ctx.to_device(a)
                                                      b_buf	
  =	
  ctx.to_device(b)
                                                      dest_buf	
  =	
  ctx.alloc(like=a)

                                                      #	
  invoke	
  function	
  (automatically	
  specialized	
  as	
  needed)
                                                      ew_op(a_buf,	
  b_buf,	
  dest_buf,	
  plus,	
  
                                                      	
  	
  	
  	
  	
  	
  global_size=a.shape,	
  local_size=(256,)).wait()

                                                      #	
  get	
  results
                                                      result	
  =	
  ctx.from_device(dest_buf)

                                                      #	
  check	
  results
                                                      print	
  la.norm(c	
  -­‐	
  (a	
  +	
  b))
Two invocation models                                 @cl.oquence.auto(lambda	
  a,	
  b,	
  dest,	
  op:	
  a.shape,	
  (256,))
 1. Standalone compilation to OpenCL                  @cl.oquence.fn
 2. Integrated into a host language                   def	
  ew_op(a,	
  b,	
  dest,	
  op):
                                                      	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
     • Python + pyopencl (w/extensions) + numpy       	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
                                                      	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])

                                                      @cl.oquence.fn
                                                      def	
  plus(a,	
  b):
                                                      	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
Four simple memory management functions                            return	
  a	
  +	
  b
 1. to_device: numpy array => new buffer              @cl.oquence.fn
 2. from_device: buffer => new numpy array            def	
  mul(a,	
  b):
 3. alloc: empty buffer                               	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
 4. copy: copies between existing buffers or arrays                return	
  a	
  *	
  b

                                                      #	
  allocate	
  two	
  random	
  arrays	
  that	
  we	
  will	
  be	
  adding
Buffers hold metadata (type, shape, order) so you     a	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
don’t have to provide it.                             b	
  =	
  numpy.random.rand(50000).astype(numpy.float32)

                                                      #	
  transfer	
  data	
  to	
  device
                                                      ctx	
  =	
  cl.ctx	
  =	
  cl.Context.for_device(0,	
  0)
Implicit queue associated with each context.          a_buf	
  =	
  ctx.to_device(a)
                                                      b_buf	
  =	
  ctx.to_device(b)
                                                      dest_buf	
  =	
  ctx.alloc(like=a)

The auto annotation can allow you to hide the         #	
  invoke	
  function	
  (automatically	
  specialized	
  as	
  needed)
details of parallelization from the user.             ew_op(a_buf,	
  b_buf,	
  dest_buf,	
  plus).wait()

                                                      #	
  get	
  results
                                                      result	
  =	
  ctx.from_device(dest_buf)

                                                      #	
  check	
  results
                                                      print	
  la.norm(c	
  -­‐	
  (a	
  +	
  b))
Two invocation models                                 @cl.oquence.auto(lambda	
  a,	
  b,	
  dest,	
  op:	
  a.shape,	
  (256,))
 1. Standalone compilation to OpenCL                  @cl.oquence.fn
 2. Integrated into a host language                   def	
  ew_op(a,	
  b,	
  dest,	
  op):
                                                      	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
     • Python + pyopencl (w/extensions) + numpy       	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
                                                      	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])

                                                      @cl.oquence.fn
                                                      def	
  plus(a,	
  b):
                                                      	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
Four simple memory management functions                            return	
  a	
  +	
  b
 1. to_device: numpy array => new buffer              @cl.oquence.fn
 2. from_device: buffer => new numpy array            def	
  mul(a,	
  b):
 3. alloc: empty buffer                               	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
 4. copy: copies between existing buffers or arrays                return	
  a	
  *	
  b

                                                      #	
  allocate	
  two	
  random	
  arrays	
  that	
  we	
  will	
  be	
  adding
Buffers hold metadata (type, shape, order) so you     a	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
don’t have to provide it.                             b	
  =	
  numpy.random.rand(50000).astype(numpy.float32)
                                                      c	
  =	
  numpy.empty_like(a)

                                                      #	
  create	
  an	
  OpenCL	
  context
Implicit queue associated with each context.          ctx	
  =	
  cl.ctx	
  =	
  cl.Context.for_device(0,	
  0)

                                                      #	
  invoke	
  function	
  (automatically	
  specialized	
  as	
  needed)
                                                      ew_op(In(a),	
  In(b),	
  Out(c),	
  plus).wait()
The auto annotation can allow you to hide the
details of parallelization from the user.             #	
  check	
  results
                                                      print	
  la.norm(c	
  -­‐	
  (a	
  +	
  b))


The In, Out and InOut constructs can help
automate data movement when convenient.
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable                                               These two libraries express the same thing.
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                  The code will run in precisely the same amount of time.
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

...                                           ...

//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];
}

__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
OpenCL
//	
  Parallel	
  elementwise	
  sum                                                                                       @cl.oquence.fn
__kernel	
  void	
  sum_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
                                          def	
  ew_op(a,	
  b,	
  dest,	
  op):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {       	
  	
  	
  	
  '''Parallel	
  elementwise	
  binary	
  operation.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                    	
  	
  	
  	
  gid	
  =	
  get_global_id(0)	
  	
  	
  	
  	
  	
  	
  	
  	
  #	
  Get	
  thread	
  index
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];                                                                   	
  	
  	
  	
  dest[gid]	
  =	
  op(a[gid],	
  b[gid])
}
                                                                                                                           @cl.oquence.fn
__kernel	
  void	
  sum_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
                                              def	
  plus(a,	
  b):
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {         	
  	
  	
  	
  '''Adds	
  the	
  two	
  operands.'''
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
                                                                               return	
  a	
  +	
  b
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}                                                                                                                          @cl.oquence.fn
                                                                                                                           def	
  mul(a,	
  b):
__kernel	
  void	
  sum_fi(__global	
  float*	
  a,	
  __global	
  int*	
  b,	
                                            	
  	
  	
  	
  '''Multiplies	
  the	
  two	
  operands.'''
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                    return	
  a	
  *	
  b
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
}

__kernel	
  void	
  sum_df(__global	
  double*	
  a,	
  __global	
  int*	
  b,	
                                              How?
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  double*	
  dest)	
  {
	
  	
  	
  	
  #pragma	
  OPENCL	
  EXTENSION	
  cl_khr_fp64	
  :	
  enable                                                         • cl.oquence.fn code looks like Python, but no!
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  +	
  b[gid];
                                                                                                                                     • Same core type system as OpenCL (C99+)
}                                                                                                                                    • Type inference to eliminate type annotations
                                                                                                                                         (not dynamic lookups)
...                                           ...                                                                                    •   Extension inference to eliminate pragmas
                                                                                                                                     •   Higher-order functions (inlined at compile-time)
//	
  Parallel	
  elementwise	
  product
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ff(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
                                                                                                                                     •   Structural polymorphism
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {                  • All functions are generic by default
	
  	
  	
  	
  size_t	
  gid	
  =	
  get_global_id(0);	
  //	
  Get	
  thread	
  index                                                   • You can call a function with any arguments
	
  	
  	
  	
  dest[gid]	
  =	
  a[gid]	
  *	
  b[gid];                                                                                    that support the operations it uses.
}
                                                                                                                                           •
__kernel	
  void	
  prod(__global	
  float*	
  a,	
  __global	
  float*	
  b,	
  
__kernel	
  void	
  prod_ii(__global	
  int*	
  a,	
  __global	
  int*	
  b,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  float*	
  dest)	
  {
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  __global	
  int*	
  dest)	
  {
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)
[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)

Contenu connexe

En vedette

[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...
[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...
[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...npinto
 
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...npinto
 
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...npinto
 
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...npinto
 
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)npinto
 
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...npinto
 
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)npinto
 
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...npinto
 

En vedette (8)

[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...
[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...
[Harvard CS264] 12 - Irregular Parallelism on the GPU: Algorithms and Data St...
 
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...
[Harvard CS264] 11a - Programming the Memory Hierarchy with Sequoia (Mike Bau...
 
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...
[Harvard CS264] 11b - Analysis-Driven Performance Optimization with CUDA (Cli...
 
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...
High-Performance Computing Needs Machine Learning... And Vice Versa (NIPS 201...
 
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)
[Harvard CS264] 07 - GPU Cluster Programming (MPI & ZeroMQ)
 
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...
[Harvard CS264] 10a - Easy, Effective, Efficient: GPU Programming in Python w...
 
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)
[Harvard CS264] 08b - MapReduce and Hadoop (Zak Stone, Harvard)
 
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...
[Harvard CS264] 09 - Machine Learning on Big Data: Lessons Learned from Googl...
 

Plus de npinto

"AI" for Blockchain Security (Case Study: Cosmos)
"AI" for Blockchain Security (Case Study: Cosmos)"AI" for Blockchain Security (Case Study: Cosmos)
"AI" for Blockchain Security (Case Study: Cosmos)npinto
 
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...npinto
 
[Harvard CS264] 05 - Advanced-level CUDA Programming
[Harvard CS264] 05 - Advanced-level CUDA Programming[Harvard CS264] 05 - Advanced-level CUDA Programming
[Harvard CS264] 05 - Advanced-level CUDA Programmingnpinto
 
[Harvard CS264] 04 - Intermediate-level CUDA Programming
[Harvard CS264] 04 - Intermediate-level CUDA Programming[Harvard CS264] 04 - Intermediate-level CUDA Programming
[Harvard CS264] 04 - Intermediate-level CUDA Programmingnpinto
 
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basicsnpinto
 
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patternsnpinto
 
[Harvard CS264] 01 - Introduction
[Harvard CS264] 01 - Introduction[Harvard CS264] 01 - Introduction
[Harvard CS264] 01 - Introductionnpinto
 
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...npinto
 
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)npinto
 
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...npinto
 
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...npinto
 

Plus de npinto (16)

"AI" for Blockchain Security (Case Study: Cosmos)
"AI" for Blockchain Security (Case Study: Cosmos)"AI" for Blockchain Security (Case Study: Cosmos)
"AI" for Blockchain Security (Case Study: Cosmos)
 
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...
[Harvard CS264] 06 - CUDA Ninja Tricks: GPU Scripting, Meta-programming & Aut...
 
[Harvard CS264] 05 - Advanced-level CUDA Programming
[Harvard CS264] 05 - Advanced-level CUDA Programming[Harvard CS264] 05 - Advanced-level CUDA Programming
[Harvard CS264] 05 - Advanced-level CUDA Programming
 
[Harvard CS264] 04 - Intermediate-level CUDA Programming
[Harvard CS264] 04 - Intermediate-level CUDA Programming[Harvard CS264] 04 - Intermediate-level CUDA Programming
[Harvard CS264] 04 - Intermediate-level CUDA Programming
 
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics
[Harvard CS264] 03 - Introduction to GPU Computing, CUDA Basics
 
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns
[Harvard CS264] 02 - Parallel Thinking, Architecture, Theory & Patterns
 
[Harvard CS264] 01 - Introduction
[Harvard CS264] 01 - Introduction[Harvard CS264] 01 - Introduction
[Harvard CS264] 01 - Introduction
 
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...
IAP09 CUDA@MIT 6.963 - Guest Lecture: Out-of-Core Programming with NVIDIA's C...
 
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...
IAP09 CUDA@MIT 6.963 - Guest Lecture: CUDA Tricks and High-Performance Comput...
 
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 07: CUDA Advanced #2 (Nicolas Pinto, MIT)
 
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)
MIT 6.870 - Template Matching and Histograms (Nicolas Pinto, MIT)
 
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 04: CUDA Advanced #1 (Nicolas Pinto, MIT)
 
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 03: CUDA Basics #2 (Nicolas Pinto, MIT)
 
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)
IAP09 CUDA@MIT 6.963 - Lecture 02: CUDA Basics #1 (Nicolas Pinto, MIT)
 
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...
IAP09 CUDA@MIT 6.963 - Lecture 01: GPU Computing using CUDA (David Luebke, NV...
 
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...
IAP09 CUDA@MIT 6.963 - Lecture 01: High-Throughput Scientific Computing (Hans...
 

Dernier

Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...
Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...
Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...Pooja Bhuva
 
Interdisciplinary_Insights_Data_Collection_Methods.pptx
Interdisciplinary_Insights_Data_Collection_Methods.pptxInterdisciplinary_Insights_Data_Collection_Methods.pptx
Interdisciplinary_Insights_Data_Collection_Methods.pptxPooja Bhuva
 
FSB Advising Checklist - Orientation 2024
FSB Advising Checklist - Orientation 2024FSB Advising Checklist - Orientation 2024
FSB Advising Checklist - Orientation 2024Elizabeth Walsh
 
Kodo Millet PPT made by Ghanshyam bairwa college of Agriculture kumher bhara...
Kodo Millet  PPT made by Ghanshyam bairwa college of Agriculture kumher bhara...Kodo Millet  PPT made by Ghanshyam bairwa college of Agriculture kumher bhara...
Kodo Millet PPT made by Ghanshyam bairwa college of Agriculture kumher bhara...pradhanghanshyam7136
 
Understanding Accommodations and Modifications
Understanding  Accommodations and ModificationsUnderstanding  Accommodations and Modifications
Understanding Accommodations and ModificationsMJDuyan
 
The basics of sentences session 3pptx.pptx
The basics of sentences session 3pptx.pptxThe basics of sentences session 3pptx.pptx
The basics of sentences session 3pptx.pptxheathfieldcps1
 
Python Notes for mca i year students osmania university.docx
Python Notes for mca i year students osmania university.docxPython Notes for mca i year students osmania university.docx
Python Notes for mca i year students osmania university.docxRamakrishna Reddy Bijjam
 
Food safety_Challenges food safety laboratories_.pdf
Food safety_Challenges food safety laboratories_.pdfFood safety_Challenges food safety laboratories_.pdf
Food safety_Challenges food safety laboratories_.pdfSherif Taha
 
Sociology 101 Demonstration of Learning Exhibit
Sociology 101 Demonstration of Learning ExhibitSociology 101 Demonstration of Learning Exhibit
Sociology 101 Demonstration of Learning Exhibitjbellavia9
 
REMIFENTANIL: An Ultra short acting opioid.pptx
REMIFENTANIL: An Ultra short acting opioid.pptxREMIFENTANIL: An Ultra short acting opioid.pptx
REMIFENTANIL: An Ultra short acting opioid.pptxDr. Ravikiran H M Gowda
 
Holdier Curriculum Vitae (April 2024).pdf
Holdier Curriculum Vitae (April 2024).pdfHoldier Curriculum Vitae (April 2024).pdf
Holdier Curriculum Vitae (April 2024).pdfagholdier
 
Spellings Wk 3 English CAPS CARES Please Practise
Spellings Wk 3 English CAPS CARES Please PractiseSpellings Wk 3 English CAPS CARES Please Practise
Spellings Wk 3 English CAPS CARES Please PractiseAnaAcapella
 
Google Gemini An AI Revolution in Education.pptx
Google Gemini An AI Revolution in Education.pptxGoogle Gemini An AI Revolution in Education.pptx
Google Gemini An AI Revolution in Education.pptxDr. Sarita Anand
 
SKILL OF INTRODUCING THE LESSON MICRO SKILLS.pptx
SKILL OF INTRODUCING THE LESSON MICRO SKILLS.pptxSKILL OF INTRODUCING THE LESSON MICRO SKILLS.pptx
SKILL OF INTRODUCING THE LESSON MICRO SKILLS.pptxAmanpreet Kaur
 
How to Manage Global Discount in Odoo 17 POS
How to Manage Global Discount in Odoo 17 POSHow to Manage Global Discount in Odoo 17 POS
How to Manage Global Discount in Odoo 17 POSCeline George
 
Jamworks pilot and AI at Jisc (20/03/2024)
Jamworks pilot and AI at Jisc (20/03/2024)Jamworks pilot and AI at Jisc (20/03/2024)
Jamworks pilot and AI at Jisc (20/03/2024)Jisc
 
2024-NATIONAL-LEARNING-CAMP-AND-OTHER.pptx
2024-NATIONAL-LEARNING-CAMP-AND-OTHER.pptx2024-NATIONAL-LEARNING-CAMP-AND-OTHER.pptx
2024-NATIONAL-LEARNING-CAMP-AND-OTHER.pptxMaritesTamaniVerdade
 
Basic Civil Engineering first year Notes- Chapter 4 Building.pptx
Basic Civil Engineering first year Notes- Chapter 4 Building.pptxBasic Civil Engineering first year Notes- Chapter 4 Building.pptx
Basic Civil Engineering first year Notes- Chapter 4 Building.pptxDenish Jangid
 
Fostering Friendships - Enhancing Social Bonds in the Classroom
Fostering Friendships - Enhancing Social Bonds  in the ClassroomFostering Friendships - Enhancing Social Bonds  in the Classroom
Fostering Friendships - Enhancing Social Bonds in the ClassroomPooky Knightsmith
 

Dernier (20)

Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...
Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...
Sensory_Experience_and_Emotional_Resonance_in_Gabriel_Okaras_The_Piano_and_Th...
 
Interdisciplinary_Insights_Data_Collection_Methods.pptx
Interdisciplinary_Insights_Data_Collection_Methods.pptxInterdisciplinary_Insights_Data_Collection_Methods.pptx
Interdisciplinary_Insights_Data_Collection_Methods.pptx
 
FSB Advising Checklist - Orientation 2024
FSB Advising Checklist - Orientation 2024FSB Advising Checklist - Orientation 2024
FSB Advising Checklist - Orientation 2024
 
Kodo Millet PPT made by Ghanshyam bairwa college of Agriculture kumher bhara...
Kodo Millet  PPT made by Ghanshyam bairwa college of Agriculture kumher bhara...Kodo Millet  PPT made by Ghanshyam bairwa college of Agriculture kumher bhara...
Kodo Millet PPT made by Ghanshyam bairwa college of Agriculture kumher bhara...
 
Understanding Accommodations and Modifications
Understanding  Accommodations and ModificationsUnderstanding  Accommodations and Modifications
Understanding Accommodations and Modifications
 
The basics of sentences session 3pptx.pptx
The basics of sentences session 3pptx.pptxThe basics of sentences session 3pptx.pptx
The basics of sentences session 3pptx.pptx
 
Python Notes for mca i year students osmania university.docx
Python Notes for mca i year students osmania university.docxPython Notes for mca i year students osmania university.docx
Python Notes for mca i year students osmania university.docx
 
Food safety_Challenges food safety laboratories_.pdf
Food safety_Challenges food safety laboratories_.pdfFood safety_Challenges food safety laboratories_.pdf
Food safety_Challenges food safety laboratories_.pdf
 
Sociology 101 Demonstration of Learning Exhibit
Sociology 101 Demonstration of Learning ExhibitSociology 101 Demonstration of Learning Exhibit
Sociology 101 Demonstration of Learning Exhibit
 
REMIFENTANIL: An Ultra short acting opioid.pptx
REMIFENTANIL: An Ultra short acting opioid.pptxREMIFENTANIL: An Ultra short acting opioid.pptx
REMIFENTANIL: An Ultra short acting opioid.pptx
 
Holdier Curriculum Vitae (April 2024).pdf
Holdier Curriculum Vitae (April 2024).pdfHoldier Curriculum Vitae (April 2024).pdf
Holdier Curriculum Vitae (April 2024).pdf
 
Spatium Project Simulation student brief
Spatium Project Simulation student briefSpatium Project Simulation student brief
Spatium Project Simulation student brief
 
Spellings Wk 3 English CAPS CARES Please Practise
Spellings Wk 3 English CAPS CARES Please PractiseSpellings Wk 3 English CAPS CARES Please Practise
Spellings Wk 3 English CAPS CARES Please Practise
 
Google Gemini An AI Revolution in Education.pptx
Google Gemini An AI Revolution in Education.pptxGoogle Gemini An AI Revolution in Education.pptx
Google Gemini An AI Revolution in Education.pptx
 
SKILL OF INTRODUCING THE LESSON MICRO SKILLS.pptx
SKILL OF INTRODUCING THE LESSON MICRO SKILLS.pptxSKILL OF INTRODUCING THE LESSON MICRO SKILLS.pptx
SKILL OF INTRODUCING THE LESSON MICRO SKILLS.pptx
 
How to Manage Global Discount in Odoo 17 POS
How to Manage Global Discount in Odoo 17 POSHow to Manage Global Discount in Odoo 17 POS
How to Manage Global Discount in Odoo 17 POS
 
Jamworks pilot and AI at Jisc (20/03/2024)
Jamworks pilot and AI at Jisc (20/03/2024)Jamworks pilot and AI at Jisc (20/03/2024)
Jamworks pilot and AI at Jisc (20/03/2024)
 
2024-NATIONAL-LEARNING-CAMP-AND-OTHER.pptx
2024-NATIONAL-LEARNING-CAMP-AND-OTHER.pptx2024-NATIONAL-LEARNING-CAMP-AND-OTHER.pptx
2024-NATIONAL-LEARNING-CAMP-AND-OTHER.pptx
 
Basic Civil Engineering first year Notes- Chapter 4 Building.pptx
Basic Civil Engineering first year Notes- Chapter 4 Building.pptxBasic Civil Engineering first year Notes- Chapter 4 Building.pptx
Basic Civil Engineering first year Notes- Chapter 4 Building.pptx
 
Fostering Friendships - Enhancing Social Bonds in the Classroom
Fostering Friendships - Enhancing Social Bonds  in the ClassroomFostering Friendships - Enhancing Social Bonds  in the Classroom
Fostering Friendships - Enhancing Social Bonds in the Classroom
 

[Harvard CS264] 10b - cl.oquence: High-Level Language Abstractions for Low-Level Programming (Cyrus Omar, CMU)

  • 1. High-Level Language Features for Low-Level Programming Cyrus Omar Computer Science Department Carnegie Mellon University http://www.cs.cmu.edu/~comar/
  • 2. The Needs of Scientists and Engineers ment group) or letting the users/stakeholders know how the 100 software works (open source, scientific paper publication). Other Using Tool is ’Standard’ 80 Reason given for use of programming language Only language known 3.8 Non-functional requirements % of respondents Tool is Easy to Use Required 60 The respondents were asked to rate a series of non-functional Open Source Reason for use of tools Favourite requirements on the following Likert scale: 40 Performance Cost (or lack thereof) of Tool 20 1. very unimportant Legacy Project Organisation Ease of use 0 2. unimportant Features Reliability Functionality Usability Availability Flexibility Performance* Portability Testability Maintainability Tracability* Reusability Developer experience Version Control is ’Required’ 3. neither Features Cross-platform compatability Improve Ease of Coding 4. important 0 10 20 30 40 50 60 0 5 Very Unimportant 10 Neither 15 20 Very Important Unimportant Number of respondents (out of 46) Important 5. very important Number of respondents Figure 7: Reasons for Choice of Programming Lan- Figure 18: Importance why non-functional require- Figure 9: Reasons of tools are used This scale was chosen so that the relative importance of guage ments as rated by respondents non-functional requirements could be determined from re- 60 spondents’ answers. A straight ranking of non-functional re- quirements would only indicate how important respondents Modelling 50 Table 1: Combined important and very important considered each non-functional requirement in comparison ratings for non-functional requirements Number of respondents Framework to others, but would not provide any information regard- 40 Ranking Requirement Combined Important ing how importantTracking Bug/Change a non-functional requirement was over- 30 and Very Important all. 
The neutral response of ‘neither’ was included as some Ratings (%) respondents may not consider a non-functional requirement Build Tools Tool type 20 1 Reliability 100 or are unaware of it. Libraries/Packages 2 Functionality 95 Non-functional requirements from the Software Require- 3 Maintainability [Nguyen-Hoan et al, 2010] 90 10 ments Specification Data Item described in United States Testing
  • 3. The State of Scientific Programming Today C, Fortran, CUDA, OpenCL MATLAB, Python, R, Perl Fast Productive Control over memory allocation Low syntactic overhead Control over data movement Read-eval-print loop (REPL) Access to hardware primitives Flexible data structures and abstractions Portability Nice development environments Tedious Slow Type annotations, templates, pragmas Dynamic lookups and indirection abound Obtuse compilers, linkers, preprocessors Automatic memory management can cause problems No support for high-level abstractions Scientists relieve the tension by: • writing overall control flow and basic data analysis routines in a high-level language • calling into a low-level language for performance-critical sections (can be annoying)
  • 4. The State of Scientific Programming Tomorrow C, Fortran, CUDA, OpenCL MATLAB, Python, R, Perl Fast Productive Control over memory allocation Low syntactic overhead Control over data movement Read-eval-print loop (REPL) Access to hardware primitives Flexible data structures and abstractions Portability Nice development environments Tedious Slow Type annotations, templates, pragmas Dynamic lookups and indirection abound Obtuse compilers, linkers, preprocessors Automatic memory management can cause problems No support for high-level abstractions Scientists relieve any remaining tension by: • writing overall control flow and basic data analysis routines in a high-level language • calling into cl.oquence for performance-critical sections (can be annoying)
  • 5. What is cl.oquence? A low-level programming language that maps closely onto, and compiles down to, OpenCL. What is OpenCL? OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is designed as a library that can be used from a variety of higher-level languages. What is a heterogeneous computing environment? A heterogeneous computing environment is an environment where many different compute devices and address spaces are available. Devices can include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core processors like the Cell BE and other specialized accelerators. Why should I use cl.oquence? • Same core type system (including pointers) and performance profile as OpenCL • Usable from any host language that has OpenCL bindings • Type inference and extension inference eliminate annotation burden • Simplified syntax is a subset of Python, can use existing tools • Structural polymorphism gives you generic programming by default • New features: • Higher-order functions • Default arguments for functions • Python as the preprocessor and module system • Rich support for compile-time metaprogramming • Write compiler extensions, new basic types as libraries; modular, clean design • Light-weight and easy to integrate into any build process • Packaged with special Python host bindings that eliminate even basic overhead when using from within Python • Built on top of pyopencl and numpy
  • 6. What is cl.oquence? A low-level programming language that maps closely onto, and compiles down to, OpenCL. What is OpenCL? OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is designed as a library that can be used from a variety of higher-level languages. What is a heterogeneous computing environment? A heterogeneous computing environment is an environment where many different compute devices and address spaces are available. Devices can include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core processors like the Cell BE and other specialized accelerators. Why should I use cl.oquence? • Same core type system (including pointers) and performance profile as OpenCL • Usable from any host language that has OpenCL bindings • Type inference and extension inference eliminate annotation burden • Simplified syntax is a subset of Python, can use existing tools • Structural polymorphism gives you generic programming by default • New features: • Higher-order functions • Default arguments for functions • Python as the preprocessor and module system • Rich support for compile-time metaprogramming • Write compiler extensions, new basic types as libraries; modular, clean design • Light-weight and easy to integrate into any build process • Packaged with special Python host bindings that eliminate even basic overhead when using from within Python • Built on top of pyopencl and numpy
  • 7. What is cl.oquence? A low-level programming language that maps closely onto, and compiles down to, OpenCL. What is OpenCL? OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is designed as a library that can be used from a variety of higher-level languages. What is a heterogeneous computing environment? A heterogeneous computing environment is one where many different devices and address spaces must be managed. Examples of devices include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core processors like the Cell BE and other specialized accelerators. Why should I use cl.oquence? • Same core type system (including pointers) and performance profile as OpenCL • Usable from any host language that has OpenCL bindings • Type inference and extension inference eliminate annotation burden • Simplified syntax is a subset of Python, can use existing tools • Structural polymorphism gives you generic programming by default • New features: • Higher-order functions • Default arguments for functions • Python as the preprocessor and module system • Rich support for compile-time metaprogramming • Write compiler extensions, new basic types as libraries; modular, clean design • Light-weight and easy to integrate into any build process • Packaged with special Python host bindings that eliminate even basic overhead when using from within Python • Built on top of pyopencl and numpy
• 8. What is cl.oquence? A low-level programming language that maps closely onto, and compiles down to, OpenCL. What is OpenCL? OpenCL is an emerging standard for low-level programming in heterogeneous computing environments. It is designed as a library that can be used from a variety of higher-level languages. What is a heterogeneous computing environment? A heterogeneous computing environment is one where many different devices and address spaces must be managed. Examples of devices include multi-core CPUs (using a variety of instruction sets), GPUs, hybrid-core processors like the Cell BE and other specialized accelerators. Why should I use cl.oquence? • Same core type system (including pointers) and performance profile as OpenCL • Usable from any host language that has OpenCL bindings • Type inference and extension inference eliminate annotational burden • Simplified syntax is a subset of Python, can use existing tools • Structural polymorphism gives you generic programming by default • New features: • Higher-order functions • Default arguments for functions • Python as the preprocessor and module system • Rich support for compile-time metaprogramming • Write compiler extensions, new basic types as libraries; modular, clean design • Light-weight and easy to integrate into any build process • Packaged with special Python host bindings that eliminate even basic overhead when using from within Python • Built on top of pyopencl and numpy
  • 9. OpenCL //  Parallel  elementwise  sum __kernel  void  sum(__global  float*  a,  __global  float*  b,                                      __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  int*  a,  __global  int*  b,                                      __global  int*  dest)  {        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  short*  a,  __global  int*  b,                                      __global  int*  dest)  {        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  float*  a,  __global  double*  b,                                      __global  float*  dest)  {        #pragma  ...        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 10. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  short*  a,  __global  int*  b,                                      __global  int*  dest)  {        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  float*  a,  __global  double*  b,                                      __global  float*  dest)  {        #pragma  ...        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 11. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  short*  a,  __global  int*  b,                                      __global  int*  dest)  {        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum(__global  float*  a,  __global  double*  b,                                      __global  float*  dest)  {        #pragma  ...        size_t  gid  =  get_global_id(0);        dest[gid]  =  a[gid]  +  b[gid]; } ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 12. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_di(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 13. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 14. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {
  • 15. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 16. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 17. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,   My photographs tell stories of loss, human struggle, and personal exploration                                          __global  float*  dest)  { within landscapes scarred by technology and over-use… [I] strive to metaphorically        size_t  gid  =  get_global_id(0);   and poetically link laborious actions, idiosyncratic rituals and strangely crude machines into tales about our modern experience.        dest[gid]  =  a[gid]  +  b[gid]; } Robert ParkeHarrison __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 18. OpenCL //  Parallel  elementwise  sum __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 19. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 20. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 21. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 22. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 23. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable These two libraries express the same thing.        size_t  gid  =  get_global_id(0);   The code will run in precisely the same amount of time.        dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 24. Two invocation models @cl.oquence.fn 1. Standalone compilation to OpenCL def  ew_op(a,  b,  dest,  op): • Use any host language that has OpenCL        '''Parallel  elementwise  binary  operation.''' bindings available        gid  =  get_global_id(0)                  #  Get  thread  index •C        dest[gid]  =  op(a[gid],  b[gid]) • C++ • Fortran @cl.oquence.fn def  plus(a,  b): • MATLAB        '''Adds  the  two  operands.''' • Java return  a  +  b • .NET • Ruby @cl.oquence.fn def  mul(a,  b): • Python        '''Multiplies  the  two  operands.''' return  a  *  b #  Programmatically  specialize  and  assign  types  to   #  any  externally  callable  versions  you  need. sum  =  ew_op.specialize(op=plus) prod  =  ew_op.specialize(op=mul) g_int_p  =  cl_int.global_ptr g_float_p  =  cl_float.global_ptr sum_ff  =  sum.compile(g_float_p,  g_float_p,  g_float_p) sum_ii  =  sum.compile(g_int_p,  g_int_p,  g_int_p)
  • 25. Two invocation models @cl.oquence.fn 1. Standalone compilation to OpenCL def  ew_op(a,  b,  dest,  op): • Use any host language that has OpenCL        '''Parallel  elementwise  binary  operation.''' bindings available        gid  =  get_global_id(0)                  #  Get  thread  index •C        dest[gid]  =  op(a[gid],  b[gid]) • C++ • Fortran @cl.oquence.fn def  plus(a,  b): • MATLAB        '''Adds  the  two  operands.''' • Java return  a  +  b • .NET • Ruby @cl.oquence.fn def  mul(a,  b): • Python        '''Multiplies  the  two  operands.''' return  a  *  b #  Programmatically  specialize  and  assign  types  to   clqcc  hello.clq #  any  externally  callable  versions  you  need. sum  =  ew_op.specialize(op=plus) creates hello.cl: prod  =  ew_op.specialize(op=mul) __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   g_int_p  =  cl_int.global_ptr                                          __global  float*  dest)  {        size_t  gid  =  get_global_id(0); g_float_p  =  cl_float.global_ptr        dest[gid]  =  a[gid]  +  b[gid]; } sum_ff  =  sum.compile(g_float_p,  g_float_p,  g_float_p) sum_ii  =  sum.compile(g_int_p,  g_int_p,  g_int_p) __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,                                            __global  int*  dest)  {        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; }
  • 26. Two invocation models @cl.oquence.fn 1. Standalone compilation to OpenCL def  ew_op(a,  b,  dest,  op): 2. Integrated into a host language        '''Parallel  elementwise  binary  operation.''' • Python + pyopencl (w/extensions) + numpy        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  op(a[gid],  b[gid]) @cl.oquence.fn def  plus(a,  b):        '''Adds  the  two  operands.''' return  a  +  b @cl.oquence.fn def  mul(a,  b):        '''Multiplies  the  two  operands.''' return  a  *  b #  allocate  two  random  arrays  that  we  will  be  adding a  =  numpy.random.rand(50000).astype(numpy.float32) b  =  numpy.random.rand(50000).astype(numpy.float32) #  transfer  data  to  device ctx  =  cl.ctx  =  cl.Context.for_device(0,  0) a_buf  =  ctx.to_device(a) b_buf  =  ctx.to_device(b) dest_buf  =  ctx.alloc(like=a) #  invoke  function  (automatically  specialized  as  needed) ew_op(a_buf,  b_buf,  dest_buf,  plus,              global_size=a.shape,  local_size=(256,)).wait() #  get  results result  =  ctx.from_device(dest_buf) #  check  results print  la.norm(c  -­‐  (a  +  b))
  • 27. Two invocation models @cl.oquence.fn 1. Standalone compilation to OpenCL def  ew_op(a,  b,  dest,  op): 2. Integrated into a host language        '''Parallel  elementwise  binary  operation.''' • Python + pyopencl (w/extensions) + numpy        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  op(a[gid],  b[gid]) @cl.oquence.fn def  plus(a,  b):        '''Adds  the  two  operands.''' Four simple memory management functions return  a  +  b 1. to_device: numpy array => new buffer @cl.oquence.fn 2. from_device: buffer => new numpy array def  mul(a,  b): 3. alloc: empty buffer        '''Multiplies  the  two  operands.''' 4. copy: copies between existing buffers or arrays return  a  *  b #  allocate  two  random  arrays  that  we  will  be  adding Buffers hold metadata (type, shape, order) so you a  =  numpy.random.rand(50000).astype(numpy.float32) don’t have to provide it. b  =  numpy.random.rand(50000).astype(numpy.float32) #  transfer  data  to  device ctx  =  cl.ctx  =  cl.Context.for_device(0,  0) a_buf  =  ctx.to_device(a) b_buf  =  ctx.to_device(b) dest_buf  =  ctx.alloc(like=a) #  invoke  function  (automatically  specialized  as  needed) ew_op(a_buf,  b_buf,  dest_buf,  plus,              global_size=a.shape,  local_size=(256,)).wait() #  get  results result  =  ctx.from_device(dest_buf) #  check  results print  la.norm(c  -­‐  (a  +  b))
  • 28. Two invocation models @cl.oquence.fn 1. Standalone compilation to OpenCL def  ew_op(a,  b,  dest,  op): 2. Integrated into a host language        '''Parallel  elementwise  binary  operation.''' • Python + pyopencl (w/extensions) + numpy        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  op(a[gid],  b[gid]) @cl.oquence.fn def  plus(a,  b):        '''Adds  the  two  operands.''' Four simple memory management functions return  a  +  b 1. to_device: numpy array => new buffer @cl.oquence.fn 2. from_device: buffer => new numpy array def  mul(a,  b): 3. alloc: empty buffer        '''Multiplies  the  two  operands.''' 4. copy: copies between existing buffers or arrays return  a  *  b #  allocate  two  random  arrays  that  we  will  be  adding Buffers hold metadata (type, shape, order) so you a  =  numpy.random.rand(50000).astype(numpy.float32) don’t have to provide it. b  =  numpy.random.rand(50000).astype(numpy.float32) #  transfer  data  to  device ctx  =  cl.ctx  =  cl.Context.for_device(0,  0) Implicit queue associated with each context. a_buf  =  ctx.to_device(a) b_buf  =  ctx.to_device(b) dest_buf  =  ctx.alloc(like=a) #  invoke  function  (automatically  specialized  as  needed) ew_op(a_buf,  b_buf,  dest_buf,  plus,              global_size=a.shape,  local_size=(256,)).wait() #  get  results result  =  ctx.from_device(dest_buf) #  check  results print  la.norm(c  -­‐  (a  +  b))
  • 29. Two invocation models @cl.oquence.auto(lambda  a,  b,  dest,  op:  a.shape,  (256,)) 1. Standalone compilation to OpenCL @cl.oquence.fn 2. Integrated into a host language def  ew_op(a,  b,  dest,  op):        '''Parallel  elementwise  binary  operation.''' • Python + pyopencl (w/extensions) + numpy        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  op(a[gid],  b[gid]) @cl.oquence.fn def  plus(a,  b):        '''Adds  the  two  operands.''' Four simple memory management functions return  a  +  b 1. to_device: numpy array => new buffer @cl.oquence.fn 2. from_device: buffer => new numpy array def  mul(a,  b): 3. alloc: empty buffer        '''Multiplies  the  two  operands.''' 4. copy: copies between existing buffers or arrays return  a  *  b #  allocate  two  random  arrays  that  we  will  be  adding Buffers hold metadata (type, shape, order) so you a  =  numpy.random.rand(50000).astype(numpy.float32) don’t have to provide it. b  =  numpy.random.rand(50000).astype(numpy.float32) #  transfer  data  to  device ctx  =  cl.ctx  =  cl.Context.for_device(0,  0) Implicit queue associated with each context. a_buf  =  ctx.to_device(a) b_buf  =  ctx.to_device(b) dest_buf  =  ctx.alloc(like=a) The auto annotation can allow you to hide the #  invoke  function  (automatically  specialized  as  needed) details of parallelization from the user. ew_op(a_buf,  b_buf,  dest_buf,  plus).wait() #  get  results result  =  ctx.from_device(dest_buf) #  check  results print  la.norm(c  -­‐  (a  +  b))
  • 30. Two invocation models @cl.oquence.auto(lambda  a,  b,  dest,  op:  a.shape,  (256,)) 1. Standalone compilation to OpenCL @cl.oquence.fn 2. Integrated into a host language def  ew_op(a,  b,  dest,  op):        '''Parallel  elementwise  binary  operation.''' • Python + pyopencl (w/extensions) + numpy        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  op(a[gid],  b[gid]) @cl.oquence.fn def  plus(a,  b):        '''Adds  the  two  operands.''' Four simple memory management functions return  a  +  b 1. to_device: numpy array => new buffer @cl.oquence.fn 2. from_device: buffer => new numpy array def  mul(a,  b): 3. alloc: empty buffer        '''Multiplies  the  two  operands.''' 4. copy: copies between existing buffers or arrays return  a  *  b #  allocate  two  random  arrays  that  we  will  be  adding Buffers hold metadata (type, shape, order) so you a  =  numpy.random.rand(50000).astype(numpy.float32) don’t have to provide it. b  =  numpy.random.rand(50000).astype(numpy.float32) c  =  numpy.empty_like(a) #  create  an  OpenCL  context Implicit queue associated with each context. ctx  =  cl.ctx  =  cl.Context.for_device(0,  0) #  invoke  function  (automatically  specialized  as  needed) ew_op(In(a),  In(b),  Out(c),  plus).wait() The auto annotation can allow you to hide the details of parallelization from the user. #  check  results print  la.norm(c  -­‐  (a  +  b)) The In, Out and InOut constructs can help automate data movement when convenient.
  • 31. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,                                            __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable These two libraries express the same thing.        size_t  gid  =  get_global_id(0);   The code will run in precisely the same amount of time.        dest[gid]  =  a[gid]  +  b[gid]; } ... ... 
//  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,                                        __global  float*  dest)  {                                            __global  float*  dest)  {        size_t  gid  =  get_global_id(0);  //  Get  thread  index        dest[gid]  =  a[gid]  *  b[gid]; } __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {
  • 32. OpenCL //  Parallel  elementwise  sum @cl.oquence.fn __kernel  void  sum_ff(__global  float*  a,  __global  float*  b,   def  ew_op(a,  b,  dest,  op):                                          __global  float*  dest)  {        '''Parallel  elementwise  binary  operation.'''        size_t  gid  =  get_global_id(0);  //  Get  thread  index        gid  =  get_global_id(0)                  #  Get  thread  index        dest[gid]  =  a[gid]  +  b[gid];        dest[gid]  =  op(a[gid],  b[gid]) } @cl.oquence.fn __kernel  void  sum_ii(__global  int*  a,  __global  int*  b,   def  plus(a,  b):                                          __global  int*  dest)  {        '''Adds  the  two  operands.'''        size_t  gid  =  get_global_id(0);   return  a  +  b        dest[gid]  =  a[gid]  +  b[gid]; } @cl.oquence.fn def  mul(a,  b): __kernel  void  sum_fi(__global  float*  a,  __global  int*  b,          '''Multiplies  the  two  operands.'''                                          __global  float*  dest)  { return  a  *  b        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; } __kernel  void  sum_df(__global  double*  a,  __global  int*  b,   How?                                          __global  double*  dest)  {        #pragma  OPENCL  EXTENSION  cl_khr_fp64  :  enable • cl.oquence.fn code looks like Python, but no!        size_t  gid  =  get_global_id(0);          dest[gid]  =  a[gid]  +  b[gid]; • Same core type system as OpenCL (C99+) } • Type inference to eliminate type annotations (not dynamic lookups) ... ... 
• Extension inference to eliminate pragmas • Higher-order functions (inlined at compile-time) //  Parallel  elementwise  product __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ff(__global  float*  a,  __global  float*  b,   • Structural polymorphism                                      __global  float*  dest)  {                                            __global  float*  dest)  { • All functions are generic by default        size_t  gid  =  get_global_id(0);  //  Get  thread  index • You can call a function with any arguments        dest[gid]  =  a[gid]  *  b[gid]; that support the operations it uses. } • __kernel  void  prod(__global  float*  a,  __global  float*  b,   __kernel  void  prod_ii(__global  int*  a,  __global  int*  b,                                        __global  float*  dest)  {                                          __global  int*  dest)  {