SlideShare une entreprise Scribd logo
1  sur  31
Télécharger pour lire hors ligne
dplyr
@romain_francois
• Using R since 2002
• #rcatladies
• R Enthusiast
• R/C++ hero
• Performance
• dplyr
• Occasional comedy
%>% from magrittr
enjoy(cool(bake(shape(beat(append(bowl(rep("flour",
2), "yeast", "water", "milk", "oil"), "flour", until
= "soft"), duration = "3mins"), as = "balls", style =
"slightly-flat"), degrees = 200, duration =
"15mins"), duration = "5mins"))
bowl(rep("flour", 2), "yeast", "water", "milk", "oil") %>%
append("flour", until = "soft") %>%

beat(duration = "3mins") %>%

shape(as = "balls", style = "slightly-flat") %>%

bake(degrees = 200, duration = "15mins") %>%

cool(buns, duration = "5mins") %>%
enjoy()
nycflights13
> flights
Source: local data frame [336,776 x 16]
year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight
1 2013 1 1 517 2 830 11 UA N14228 1545
2 2013 1 1 533 4 850 20 UA N24211 1714
.. ... ... ... ... ... ... ... ... ... ...
Variables not shown: origin (chr), dest (chr), air_time (dbl), distance (dbl),
hour (dbl), minute (dbl)
nycflights13
> glimpse(flights)
Observations: 336,776
Variables: 16
$ year (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 201...
$ month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ day (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ dep_time (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 55...
$ dep_delay (dbl) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, ...
$ arr_time (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 8...
$ arr_delay (dbl) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14,...
$ carrier (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA...
$ tailnum (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463...
$ flight (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49,...
$ origin (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "...
$ dest (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "...
$ air_time (dbl) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158...
$ distance (dbl) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 10...
$ hour (dbl) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, ...
$ minute (dbl) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, 58, 5...
filter: a subset of the rows of the data frame
flights %>%
filter( dep_delay < 10 )
flights %>%
filter( arr_delay < dep_delay )
slice: filter rows by position
flights %>%
slice( 1:10 )
arrange: reorder a data frame
flights %>%
filter( hour < 8 ) %>%
arrange( year, month, day )
select: select certain columns from the data frame
select(flights, year, month, day)
select(flights, year:day)
select(flights, -(year:day))
mutate: modify or create columns based on others
flights %>%
mutate(
gain = arr_delay - dep_delay,
speed = distance / air_time * 60
) %>%
filter( gain > 0 ) %>%
arrange( desc(speed) ) %>%
select( year, month, day, dest, gain, speed )
summarise: collapse a data frame into one row …
flights %>%
summarise(delay = mean(dep_delay, na.rm = TRUE))
flights %>%
filter( dep_delay > 0 ) %>%
summarise(arr_delay = mean(arr_delay, na.rm = TRUE))
group_by: group observations by one or more variables
flights %>%
group_by( tailnum ) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter( is.finite(delay) ) %>%
arrange( desc(count) )
bind_rows
bind_rows( , )
color num
green 1
yellow 2
red 3
blue 4
pink 5
color num
green 1
yellow 2
color num
red 3
blue 4
pink 5
joins
a <- data_frame(
color = c("green", "yellow", "red"),
num = 1:3
)
b <- data_frame(
color = c("green", "yellow", "pink"),
size = c("S", "M", "L")
)
color num
green 1
yellow 2
red 3
color size
green S
yellow M
pink L
inner_join
color num
green 1
yellow 2
red 3
color size
green S
yellow M
pink L
inner_join( , )
color num size
green 1 S
yellow 2 M
left_join
color num
green 1
yellow 2
red 3
color size
green S
yellow M
pink L
left_join( , )
color num size
green 1 S
yellow 2 M
red 3
right_join
color num
green 1
yellow 2
red 3
color size
green S
yellow M
pink L
right_join( , )
color num size
green 1 S
yellow 2 M
pink L
full_join
color num
green 1
yellow 2
red 3
color size
green S
yellow M
pink L
full_join( , )
color num size
green 1 S
yellow 2 M
red 3
pink L
data_frame: just like data.frame, but better
> data_frame( x = 1:5, y = letters[1:5] ) %>% glimpse
Observations: 5
Variables: 2
$ x (int) 1, 2, 3, 4, 5
$ y (chr) "a", "b", "c", "d", "e"
> data_frame( x = 1:5, y = letters[1:5] , z = x + 1) %>% glimpse
Observations: 5
Variables: 3
$ x (int) 1, 2, 3, 4, 5
$ y (chr) "a", "b", "c", "d", "e"
$ z (dbl) 2, 3, 4, 5, 6
frame_data aka tibble
> frame_data(
+ ~colA, ~colB,
+ "a", 1,
+ "b", 2
+ )
Source: local data frame [2 x 2]
colA colB
(chr) (dbl)
1 a 1
2 b 2
_
g <- c("origin", "dest")
v <- "dep_delay"
flights %>%
group_by( g ) %>%
summarise( result = mean(v, na.rm = TRUE) )
🙀
🙀
g <- c("origin", "dest")
v <- "dep_delay"
flights %>%
group_by_( .dots = g ) %>%
summarise_( .dots =
interp(~ mean(var, na.rm = TRUE), var = as.name(v))
)
Future
• Performance improvements (parallel C++)
• Alternative back ends
• Different types of groupings (e.g. bootstrap)
As soon as we get hoverboards ...
dplyr
Romain François
@romain_francois
romain@r-enthusiasts.com

Contenu connexe

Tendances

The Aggregation Framework
The Aggregation FrameworkThe Aggregation Framework
The Aggregation Framework
MongoDB
 
Cassandra Backups and Restorations Using Ansible (Joshua Wickman, Knewton) | ...
Cassandra Backups and Restorations Using Ansible (Joshua Wickman, Knewton) | ...Cassandra Backups and Restorations Using Ansible (Joshua Wickman, Knewton) | ...
Cassandra Backups and Restorations Using Ansible (Joshua Wickman, Knewton) | ...
DataStax
 
YARN: the Key to overcoming the challenges of broad-based Hadoop Adoption
YARN: the Key to overcoming the challenges of broad-based Hadoop AdoptionYARN: the Key to overcoming the challenges of broad-based Hadoop Adoption
YARN: the Key to overcoming the challenges of broad-based Hadoop Adoption
DataWorks Summit
 
pgpool-II demonstration
pgpool-II demonstrationpgpool-II demonstration
pgpool-II demonstration
elliando dias
 

Tendances (20)

Using ClickHouse for Experimentation
Using ClickHouse for ExperimentationUsing ClickHouse for Experimentation
Using ClickHouse for Experimentation
 
The Aggregation Framework
The Aggregation FrameworkThe Aggregation Framework
The Aggregation Framework
 
Cassandra Backups and Restorations Using Ansible (Joshua Wickman, Knewton) | ...
Cassandra Backups and Restorations Using Ansible (Joshua Wickman, Knewton) | ...Cassandra Backups and Restorations Using Ansible (Joshua Wickman, Knewton) | ...
Cassandra Backups and Restorations Using Ansible (Joshua Wickman, Knewton) | ...
 
How Netflix Uses Druid in Real-time to Ensure a High Quality Streaming Experi...
How Netflix Uses Druid in Real-time to Ensure a High Quality Streaming Experi...How Netflix Uses Druid in Real-time to Ensure a High Quality Streaming Experi...
How Netflix Uses Druid in Real-time to Ensure a High Quality Streaming Experi...
 
ClickHouse materialized views - a secret weapon for high performance analytic...
ClickHouse materialized views - a secret weapon for high performance analytic...ClickHouse materialized views - a secret weapon for high performance analytic...
ClickHouse materialized views - a secret weapon for high performance analytic...
 
(DAT401) Amazon DynamoDB Deep Dive
(DAT401) Amazon DynamoDB Deep Dive(DAT401) Amazon DynamoDB Deep Dive
(DAT401) Amazon DynamoDB Deep Dive
 
Compression Options in Hadoop - A Tale of Tradeoffs
Compression Options in Hadoop - A Tale of TradeoffsCompression Options in Hadoop - A Tale of Tradeoffs
Compression Options in Hadoop - A Tale of Tradeoffs
 
Pinot: Near Realtime Analytics @ Uber
Pinot: Near Realtime Analytics @ UberPinot: Near Realtime Analytics @ Uber
Pinot: Near Realtime Analytics @ Uber
 
Near Real-Time IoT Analytics of Pumping Stations in PowerBI
Near Real-Time IoT Analytics of Pumping Stations in PowerBINear Real-Time IoT Analytics of Pumping Stations in PowerBI
Near Real-Time IoT Analytics of Pumping Stations in PowerBI
 
YARN: the Key to overcoming the challenges of broad-based Hadoop Adoption
YARN: the Key to overcoming the challenges of broad-based Hadoop AdoptionYARN: the Key to overcoming the challenges of broad-based Hadoop Adoption
YARN: the Key to overcoming the challenges of broad-based Hadoop Adoption
 
Sizing MongoDB Clusters
Sizing MongoDB Clusters Sizing MongoDB Clusters
Sizing MongoDB Clusters
 
오픈소스 모니터링 알아보기(Learn about opensource monitoring)
오픈소스 모니터링 알아보기(Learn about opensource monitoring)오픈소스 모니터링 알아보기(Learn about opensource monitoring)
오픈소스 모니터링 알아보기(Learn about opensource monitoring)
 
pgpool-II demonstration
pgpool-II demonstrationpgpool-II demonstration
pgpool-II demonstration
 
Deep Dive: Amazon DynamoDB
Deep Dive: Amazon DynamoDBDeep Dive: Amazon DynamoDB
Deep Dive: Amazon DynamoDB
 
Apache Kafka
Apache KafkaApache Kafka
Apache Kafka
 
A brief introduction to Machine Learning
A brief introduction to Machine LearningA brief introduction to Machine Learning
A brief introduction to Machine Learning
 
Introduction to Kafka and Zookeeper
Introduction to Kafka and ZookeeperIntroduction to Kafka and Zookeeper
Introduction to Kafka and Zookeeper
 
MongoDB vs. Postgres Benchmarks
MongoDB vs. Postgres Benchmarks MongoDB vs. Postgres Benchmarks
MongoDB vs. Postgres Benchmarks
 
Open Source Logging and Monitoring Tools
Open Source Logging and Monitoring ToolsOpen Source Logging and Monitoring Tools
Open Source Logging and Monitoring Tools
 
Introduction to memcached
Introduction to memcachedIntroduction to memcached
Introduction to memcached
 

En vedette

En vedette (10)

Data Manipulation Using R (& dplyr)
Data Manipulation Using R (& dplyr)Data Manipulation Using R (& dplyr)
Data Manipulation Using R (& dplyr)
 
Rデータ処理入門
Rデータ処理入門Rデータ処理入門
Rデータ処理入門
 
Tokyor36
Tokyor36Tokyor36
Tokyor36
 
Introduction to R Short course Fall 2016
Introduction to R Short course Fall 2016Introduction to R Short course Fall 2016
Introduction to R Short course Fall 2016
 
Data manipulation with dplyr
Data manipulation with dplyrData manipulation with dplyr
Data manipulation with dplyr
 
dplyrとは何だったのか
dplyrとは何だったのかdplyrとは何だったのか
dplyrとは何だったのか
 
「plyrパッケージで君も前処理スタ☆」改め「plyrパッケージ徹底入門」
「plyrパッケージで君も前処理スタ☆」改め「plyrパッケージ徹底入門」「plyrパッケージで君も前処理スタ☆」改め「plyrパッケージ徹底入門」
「plyrパッケージで君も前処理スタ☆」改め「plyrパッケージ徹底入門」
 
木と電話と選挙(causalTree)
木と電話と選挙(causalTree)木と電話と選挙(causalTree)
木と電話と選挙(causalTree)
 
R入門(dplyrでデータ加工)-TokyoR42
R入門(dplyrでデータ加工)-TokyoR42R入門(dplyrでデータ加工)-TokyoR42
R入門(dplyrでデータ加工)-TokyoR42
 
Tidyverseとは
TidyverseとはTidyverseとは
Tidyverseとは
 

Similaire à dplyr

Palestra sobre Collections com Python
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Python
pugpe
 
Global Change, Species Diversity, and the Future of Marine Ecosystems
Global Change, Species Diversity, and the Future of Marine EcosystemsGlobal Change, Species Diversity, and the Future of Marine Ecosystems
Global Change, Species Diversity, and the Future of Marine Ecosystems
Jarrett Byrnes
 
Danos morais obito dengue hemorragica resp. estado
Danos morais obito dengue hemorragica resp. estadoDanos morais obito dengue hemorragica resp. estado
Danos morais obito dengue hemorragica resp. estado
Informa Jurídico
 

Similaire à dplyr (20)

dplyr and torrents from cpasbien
dplyr and torrents from cpasbiendplyr and torrents from cpasbien
dplyr and torrents from cpasbien
 
dplyr
dplyrdplyr
dplyr
 
Τα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την PythonΤα Πολύ Βασικά για την Python
Τα Πολύ Βασικά για την Python
 
Super Advanced Python –act1
Super Advanced Python –act1Super Advanced Python –act1
Super Advanced Python –act1
 
sopa de pollo para el alma latina
sopa de pollo para el alma latinasopa de pollo para el alma latina
sopa de pollo para el alma latina
 
Elixir
ElixirElixir
Elixir
 
Sopa de pollo para el alma Latina
Sopa de pollo para el alma LatinaSopa de pollo para el alma Latina
Sopa de pollo para el alma Latina
 
R programming language
R programming languageR programming language
R programming language
 
Basics
BasicsBasics
Basics
 
r studio presentation.pptx
r studio presentation.pptxr studio presentation.pptx
r studio presentation.pptx
 
r studio presentation.pptx
r studio presentation.pptxr studio presentation.pptx
r studio presentation.pptx
 
Evergreen trails master plan community meeting 1 boards
Evergreen trails master plan community meeting 1 boardsEvergreen trails master plan community meeting 1 boards
Evergreen trails master plan community meeting 1 boards
 
dplyr use case
dplyr use casedplyr use case
dplyr use case
 
Oceans 2019 tutorial-geophysical-nav_7-updated
Oceans 2019 tutorial-geophysical-nav_7-updatedOceans 2019 tutorial-geophysical-nav_7-updated
Oceans 2019 tutorial-geophysical-nav_7-updated
 
Al Fazl International Weekly26 June 2015
Al Fazl International  Weekly26 June 2015Al Fazl International  Weekly26 June 2015
Al Fazl International Weekly26 June 2015
 
ΠΛΗ31 ΜΑΘΗΜΑ 2.2 (ΕΚΤΥΠΩΣΗ)
ΠΛΗ31 ΜΑΘΗΜΑ 2.2 (ΕΚΤΥΠΩΣΗ)ΠΛΗ31 ΜΑΘΗΜΑ 2.2 (ΕΚΤΥΠΩΣΗ)
ΠΛΗ31 ΜΑΘΗΜΑ 2.2 (ΕΚΤΥΠΩΣΗ)
 
Palestra sobre Collections com Python
Palestra sobre Collections com PythonPalestra sobre Collections com Python
Palestra sobre Collections com Python
 
Global Change, Species Diversity, and the Future of Marine Ecosystems
Global Change, Species Diversity, and the Future of Marine EcosystemsGlobal Change, Species Diversity, and the Future of Marine Ecosystems
Global Change, Species Diversity, and the Future of Marine Ecosystems
 
Encuesta de valores de los mexicanos...
Encuesta de valores de los mexicanos...Encuesta de valores de los mexicanos...
Encuesta de valores de los mexicanos...
 
Danos morais obito dengue hemorragica resp. estado
Danos morais obito dengue hemorragica resp. estadoDanos morais obito dengue hemorragica resp. estado
Danos morais obito dengue hemorragica resp. estado
 

Plus de Romain Francois

Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
Romain Francois
 
RProtoBuf: protocol buffers for R
RProtoBuf: protocol buffers for RRProtoBuf: protocol buffers for R
RProtoBuf: protocol buffers for R
Romain Francois
 
Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
Romain Francois
 
Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
Romain Francois
 

Plus de Romain Francois (18)

R/C++
R/C++R/C++
R/C++
 
user2015 keynote talk
user2015 keynote talkuser2015 keynote talk
user2015 keynote talk
 
SevillaR meetup: dplyr and magrittr
SevillaR meetup: dplyr and magrittrSevillaR meetup: dplyr and magrittr
SevillaR meetup: dplyr and magrittr
 
R/C++ talk at earl 2014
R/C++ talk at earl 2014R/C++ talk at earl 2014
R/C++ talk at earl 2014
 
Rcpp11 genentech
Rcpp11 genentechRcpp11 genentech
Rcpp11 genentech
 
Rcpp11 useR2014
Rcpp11 useR2014Rcpp11 useR2014
Rcpp11 useR2014
 
Rcpp11
Rcpp11Rcpp11
Rcpp11
 
R and C++
R and C++R and C++
R and C++
 
R and cpp
R and cppR and cpp
R and cpp
 
Rcpp attributes
Rcpp attributesRcpp attributes
Rcpp attributes
 
Rcpp is-ready
Rcpp is-readyRcpp is-ready
Rcpp is-ready
 
Rcpp
RcppRcpp
Rcpp
 
Integrating R with C++: Rcpp, RInside and RProtoBuf
Integrating R with C++: Rcpp, RInside and RProtoBufIntegrating R with C++: Rcpp, RInside and RProtoBuf
Integrating R with C++: Rcpp, RInside and RProtoBuf
 
Object Oriented Design(s) in R
Object Oriented Design(s) in RObject Oriented Design(s) in R
Object Oriented Design(s) in R
 
Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
 
RProtoBuf: protocol buffers for R
RProtoBuf: protocol buffers for RRProtoBuf: protocol buffers for R
RProtoBuf: protocol buffers for R
 
Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
 
Rcpp: Seemless R and C++
Rcpp: Seemless R and C++Rcpp: Seemless R and C++
Rcpp: Seemless R and C++
 

Dernier

+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
?#DUbAI#??##{{(☎️+971_581248768%)**%*]'#abortion pills for sale in dubai@
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slide
vu2urc
 

Dernier (20)

AWS Community Day CPH - Three problems of Terraform
AWS Community Day CPH - Three problems of TerraformAWS Community Day CPH - Three problems of Terraform
AWS Community Day CPH - Three problems of Terraform
 
Understanding Discord NSFW Servers A Guide for Responsible Users.pdf
Understanding Discord NSFW Servers A Guide for Responsible Users.pdfUnderstanding Discord NSFW Servers A Guide for Responsible Users.pdf
Understanding Discord NSFW Servers A Guide for Responsible Users.pdf
 
GenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day PresentationGenCyber Cyber Security Day Presentation
GenCyber Cyber Security Day Presentation
 
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
+971581248768>> SAFE AND ORIGINAL ABORTION PILLS FOR SALE IN DUBAI AND ABUDHA...
 
Developing An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of BrazilDeveloping An App To Navigate The Roads of Brazil
Developing An App To Navigate The Roads of Brazil
 
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law DevelopmentsTrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
TrustArc Webinar - Stay Ahead of US State Data Privacy Law Developments
 
Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...Apidays New York 2024 - The value of a flexible API Management solution for O...
Apidays New York 2024 - The value of a flexible API Management solution for O...
 
Boost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdfBoost Fertility New Invention Ups Success Rates.pdf
Boost Fertility New Invention Ups Success Rates.pdf
 
Histor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slideHistor y of HAM Radio presentation slide
Histor y of HAM Radio presentation slide
 
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
Mastering MySQL Database Architecture: Deep Dive into MySQL Shell and MySQL R...
 
TrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data DiscoveryTrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
TrustArc Webinar - Unlock the Power of AI-Driven Data Discovery
 
Scaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organizationScaling API-first – The story of a global engineering organization
Scaling API-first – The story of a global engineering organization
 
Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024Axa Assurance Maroc - Insurer Innovation Award 2024
Axa Assurance Maroc - Insurer Innovation Award 2024
 
Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024Finology Group – Insurtech Innovation Award 2024
Finology Group – Insurtech Innovation Award 2024
 
2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...2024: Domino Containers - The Next Step. News from the Domino Container commu...
2024: Domino Containers - The Next Step. News from the Domino Container commu...
 
Real Time Object Detection Using Open CV
Real Time Object Detection Using Open CVReal Time Object Detection Using Open CV
Real Time Object Detection Using Open CV
 
Handwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed textsHandwritten Text Recognition for manuscripts and early printed texts
Handwritten Text Recognition for manuscripts and early printed texts
 
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
Apidays Singapore 2024 - Building Digital Trust in a Digital Economy by Veron...
 
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...Workshop - Best of Both Worlds_ Combine  KG and Vector search for  enhanced R...
Workshop - Best of Both Worlds_ Combine KG and Vector search for enhanced R...
 
presentation ICT roal in 21st century education
presentation ICT roal in 21st century educationpresentation ICT roal in 21st century education
presentation ICT roal in 21st century education
 

dplyr

  • 2. • Use R since 2002 • #rcatladies • R Enthusiast • R/C++ hero • Performance • dplyr • Occasional comedy
  • 3.
  • 4.
  • 6. enjoy(cool(bake(shape(beat(append(bowl(rep("flour", 2), "yeast", "water", "milk", "oil"), "flour", until = "soft"), duration = "3mins"), as = "balls", style = "slightly-flat"), degrees = 200, duration = "15mins"), duration = "5mins")) bowl(rep("flour", 2), "yeast", "water", "milk", "oil") %>% append("flour", until = "soft") %>%
 beat(duration = "3mins") %>%
 shape(as = "balls", style = "slightly-flat") %>%
 bake(degrees = 200, duration = "15mins") %>%
 cool(buns, duration = "5mins") %>% enjoy()
  • 7. nycflights13 > flights Source: local data frame [336,776 x 16] year month day dep_time dep_delay arr_time arr_delay carrier tailnum flight 1 2013 1 1 517 2 830 11 UA N14228 1545 2 2013 1 1 533 4 850 20 UA N24211 1714 .. ... ... ... ... ... ... ... ... ... ... Variables not shown: origin (chr), dest (chr), air_time (dbl), distance (dbl), hour (dbl), minute (dbl)
  • 8. nycflights13 > glimpse(flights) Observations: 336,776 Variables: 16 $ year (int) 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 201... $ month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ day (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... $ dep_time (int) 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 55... $ dep_delay (dbl) 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, ... $ arr_time (int) 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 8... $ arr_delay (dbl) 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14,... $ carrier (chr) "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA... $ tailnum (chr) "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463... $ flight (int) 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49,... $ origin (chr) "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "... $ dest (chr) "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "... $ air_time (dbl) 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158... $ distance (dbl) 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 10... $ hour (dbl) 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, ... $ minute (dbl) 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, 58, 5...
  • 9. filterA subset of the rows of the data frame flights %>% filter( dep_delay < 10 ) flights %>% filter( arr_delay < dep_delay )
  • 10. slicefilter rows by position flights %>% slice( 1:10 )
  • 11. arrangereorder a data frame flights %>% filter( hour < 8 ) %>% arrange( year, month, day )
  • 12. selectselect certain columns from the data frame select(flights, year, month, day) select(flights, year:day) select(flights, -(year:day))
  • 13. mutatemodify or create columns based on others flights %>% mutate( gain = arr_delay - dep_delay, speed = distance / air_time * 60 ) %>% filter( gain > 0 ) %>% arrange( desc(speed) ) %>% select( year, month, day, dest, gain, speed )
  • 14. summarisecollapse a data frame into one row … flights %>% summarise(delay = mean(dep_delay, na.rm = TRUE)) flights %>% filter( dep_delay > 0 ) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE))
  • 15. group_byGroup observations by one or more variables flights %>% group_by( tailnum ) %>% summarise( count = n(), dist = mean(distance, na.rm = TRUE), delay = mean(arr_delay, na.rm = TRUE) ) %>% filter( is.finite(delay) ) %>% arrange( desc(count) )
  • 16. bind_rows bind_rows( , ) color num green 1 yellow 2 red 3 blue 4 pink 5 color num green 1 yellow 2 color num red 3 blue 4 pink 5
  • 17. joins a <- data_frame( color = c("green", "yellow", "red"), num = 1:3 ) b <- data_frame( color = c("green", "yellow", "pink"), size = c("S", "M", "L") ) color num green 1 yellow 2 red 3 color size green S yellow M pink L
  • 18. inner_join color num green 1 yellow 2 red 3 color size green S yellow M pink L inner_join( , ) color num size green 1 S yellow 2 M
  • 19. left_join color num green 1 yellow 2 red 3 color size green S yellow M pink L left_join( , ) color num size green 1 S yellow 2 M red 3
  • 20. right_join color num green 1 yellow 2 red 3 color size green S yellow M pink L right_join( , ) color num size green 1 S yellow 2 M pink L
  • 21. full_join color num green 1 yellow 2 red 3 color size green S yellow M pink L full_join( , ) color num size green 1 S yellow 2 M red 3 pink L
  • 22. data_frameJust like data.frame, but better > data_frame( x = 1:5, y = letters[1:5] ) %>% glimpse Observations: 5 Variables: 2 $ x (int) 1, 2, 3, 4, 5 $ y (chr) "a", "b", "c", "d", "e" > data_frame( x = 1:5, y = letters[1:5] , z = x + 1) %>% glimpse Observations: 5 Variables: 3 $ x (int) 1, 2, 3, 4, 5 $ y (chr) "a", "b", "c", "d", "e" $ z (dbl) 2, 3, 4, 5, 6
  • 23. frame_data aka tibble > frame_data( + ~colA, ~colB, + "a", 1, + "b", 2 + ) Source: local data frame [2 x 2] colA colB (chr) (dbl) 1 a 1 2 b 2
  • 24. _
  • 25. g <- c("origin", "dest") v <- "dep_delay" flights %>% group_by( g ) %>% summarise( result = mean(v, na.rm = TRUE) ) 🙀 🙀
  • 26. g <- c("origin", "dest") v <- "dep_delay" flights %>% group_by_( .dots = g ) %>% summarise_( .dots = interp(~ mean(var, na.rm = TRUE), var = as.name(v)) )
  • 27.
  • 28.
  • 29.
  • 30. Future • Performance improvements (parallel C++) • Alternative back ends • Different type of groupings (e.g. bootstrap) As soon as we get hoverboard ...