SlideShare une entreprise Scribd logo
1  sur  25
Télécharger pour lire hors ligne
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Nested	
  Types	
  in	
  Impala
Alex	
  Behm	
  //	
  Cloudera,	
  Inc.	
  
Marcel	
  Kornacker	
  //	
  Cloudera,	
  Inc.	
  
Skye	
  Wanderman-Milne	
  //	
  Cloudera,	
  Inc.
Impala	
  Meetup	
  03/24/2015
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Design	
  Goals
• Goals	
  
• Support	
  for	
  nested	
  data	
  types:	
  struct,	
  map,	
  array	
  
• Full	
  expressiveness	
  of	
  SQL	
  with	
  nested	
  structures	
  
• v1	
  Prioritization	
  
• Focus	
  on	
  SELECT	
  queries	
  (INSERT	
  in	
  later	
  releases)	
  
• Focus	
  on	
  native	
  Parquet	
  and	
  Avro	
  formats	
  (XML,	
  JSON,	
  etc	
  in	
  later	
  releases)	
  
• Focus	
  on	
  built-in	
  language	
  expressiveness	
  (UDTF	
  extensibility	
  in	
  later	
  releases)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  Schema
CREATE TABLE Customers (
id BIGINT,
address STRUCT<
city: STRING,
zip: INT
>,
orders ARRAY<STRUCT<
txn_time: TIMESTAMP,
cc: BIGINT,
items: ARRAY<STRUCT<
item_no: STRING,
price: DECIMAL(9,2)
>>
>>,
preferred_cc BIGINT
)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  Extensions
• Path	
  expressions	
  extend	
  column	
  references	
  to	
  scalars	
  (nested	
  structs)	
  
• Can	
  appear	
  anywhere	
  a	
  conventional	
  column	
  reference	
  is	
  used	
  
• Collections	
  (maps	
  and	
  arrays)	
  are	
  exposed	
  like	
  sub-tables	
  
• Use	
  FROM	
  clause	
  to	
  specify	
  which	
  collections	
  to	
  read	
  like	
  conventional	
  tables	
  
• Can	
  use	
  JOIN	
  conditions	
  to	
  express	
  join	
  relationship	
  (default	
  is	
  INNER	
  JOIN)
Find	
  the	
  ids	
  and	
  city	
  of	
  customers	
  who	
  live	
  in	
  the	
  zip	
  code	
  94305:	
  
SELECT	
  id,	
  address.city	
  FROM	
  customers	
  WHERE	
  address.zip	
  =	
  94305
Find	
  all	
  orders	
  that	
  were	
  paid	
  for	
  with	
  a	
  customer’s	
  preferred	
  credit	
  card:	
  
SELECT	
  o.txn_id	
  FROM	
  customers	
  c,	
  c.orders	
  o	
  WHERE	
  o.cc	
  =	
  c.preferred_cc	
  	
  
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Referencing	
  Arrays	
  &	
  Maps
• Basic	
  idea:	
  Flatten	
  nested	
  collections	
  referenced	
  in	
  the	
  FROM	
  clause	
  
• Can	
  be	
  thought	
  of	
  as	
  an	
  implicit	
  join	
  on	
  the	
  parent/child	
  relationship
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c,	
  c.orders	
  o
c.id o.txn_id
100 203
100 305
100 507
… …
101 10056
101 10
… …
id	
  of	
  a	
  customer	
  repeated	
  
for	
  every	
  order
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Referencing	
  Arrays	
  &	
  Maps
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c,	
  c.orders	
  o	
  
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c	
  INNER	
  JOIN	
  c.orders	
  o
Returns	
  customer/order	
  data	
  for	
  customers	
  that	
  have	
  at	
  least	
  one	
  order
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c	
  LEFT	
  OUTER	
  JOIN	
  c.orders	
  o
Also	
  returns	
  customers	
  with	
  no	
  orders	
  (with	
  order	
  fields	
  NULL)
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c	
  LEFT	
  ANTI	
  JOIN	
  c.orders	
  o
Find	
  all	
  customers	
  that	
  have	
  no	
  orders
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Motivation	
  for	
  Advanced	
  Querying	
  Capabilities
• Count	
  the	
  number	
  of	
  orders	
  per	
  customer	
  
• Count	
  the	
  number	
  of	
  items	
  per	
  order	
  
• Impractical	
  à Requires	
  unique	
  id	
  at	
  every	
  nesting	
  level	
  
• Information	
  is	
  already	
  expressed	
  in	
  nesting	
  relationship!	
  
• What	
  about	
  even	
  more	
  interesting	
  queries?	
  
• Get	
  the	
  number	
  of	
  orders	
  and	
  the	
  average	
  item	
  price	
  per	
  customer?	
  
• “Group	
  by”	
  multiple	
  nesting	
  levels
SELECT	
  COUNT(*)	
  FROM	
  customers	
  c,	
  c.orders	
  o	
  GROUP	
  BY	
  c.id
SELECT	
  COUNT(*)	
  FROM	
  customers.orders	
  o,	
  o.items	
  GROUP	
  BY	
  ???
Must	
  be	
  unique
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Advanced	
  Querying:	
  Correlated	
  Table	
  References
• Count	
  the	
  number	
  of	
  orders	
  per	
  customer	
  
• Count	
  the	
  number	
  of	
  items	
  per	
  order	
  
• Get	
  the	
  number	
  of	
  orders	
  and	
  the	
  average	
  item	
  price	
  per	
  customer
SELECT	
  cnt	
  FROM	
  customers	
  c,	
  (SELECT	
  COUNT(*)	
  cnt	
  FROM	
  c.orders)	
  v
SELECT	
  cnt	
  FROM	
  customers.orders	
  o,	
  (SELECT	
  COUNT(*)	
  cnt	
  FROM	
  o.items)	
  v
SELECT	
  c.id,	
  cnt,	
  avgp	
  
FROM	
  customers	
  c,	
  
	
  	
  (SELECT	
  count(1)	
  cnt	
  FROM	
  c.orders)	
  v1,	
  
	
  	
  (SELECT	
  avg(price)	
  avgp	
  FROM	
  c.orders.items)	
  v2
Correlated	
  reference	
  to	
  “c”	
  
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Advanced	
  Querying:	
  Correlated	
  Table	
  References
• Full	
  expressibility	
  
• Arbitrary	
  SQL	
  allowed	
  in	
  inline	
  views	
  and	
  subqueries	
  with	
  correlated	
  table	
  refs	
  
• Correlated	
  subqueries	
  transformed	
  into	
  joins	
  if	
  possible	
  
• Exploits	
  nesting	
  relationship	
  
• No	
  need	
  for	
  stored	
  unique	
  ids	
  at	
  various	
  levels	
  
• Similar	
  to	
  standard	
  SQL	
  ‘LATERAL’,	
  ‘CROSS	
  APPLY’,	
  ‘OUTER	
  CROSS	
  APPLY’	
  
• Goes	
  beyond	
  standard	
  in	
  some	
  aspects	
  (semi/anti	
  variants)	
  
• Not	
  as	
  general	
  as	
  SQL	
  standard	
  (limited	
  correlations)
SELECT	
  id	
  FROM	
  customers	
  c	
  WHERE	
  EXISTS	
  
	
  	
  (SELECT	
  1	
  FROM	
  c.orders	
  o,	
  o.items	
  i	
  where	
  o.cc	
  =	
  c.preferred_cc	
  and	
  i.price	
  >	
  100)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Execution	
  Model
Design	
  sweet	
  spot:	
  small/medium	
  sized	
  collections	
  (max	
  few	
  hundreds	
  of	
  MB)	
  
• Built-­‐in	
  limitation	
  on	
  the	
  size	
  of	
  nested	
  collections	
  (TBD)	
  
• Huge	
  collections	
  not	
  expected	
  to	
  be	
  performant	
  and	
  rare	
  
Execution	
  Overview	
  
• Scans	
  materialize	
  minimal	
  nested	
  structure	
  in	
  memory	
  
• Three	
  new	
  exec	
  nodes	
  
• Subplan:	
  Executes	
  its	
  subplan	
  tree	
  for	
  each	
  input	
  row	
  and	
  returns	
  the	
  rows	
  
produced	
  by	
  the	
  subplan	
  
• Nested	
  Row	
  Src:	
  Returns	
  the	
  current	
  input	
  row	
  of	
  its	
  parent	
  Subplan	
  node	
  
• Unnest:	
  Scans	
  an	
  array	
  slot	
  of	
  the	
  current	
  input	
  row	
  of	
  its	
  parent	
  Subplan	
  node	
  or	
  
of	
  an	
  output	
  row	
  from	
  its	
  child	
  plan	
  node,	
  returning	
  one	
  row	
  per	
  array	
  element.	
  
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  A
SELECT	
  count(*)	
  FROM	
  customer.orders.items
Scan	
  
customer.orders.items
Aggregate	
  
count(*)
Count	
  the	
  total	
  number	
  of	
  items
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  B
SELECT	
  c.id,	
  cnt	
  
FROM	
  customer	
  c,	
  
JOIN	
  (SELECT	
  count(*)	
  cnt	
  FROM	
  c.orders)	
  v
Scan	
  c
Subplan
materialize	
  
c.orders
Unnest	
  
c.orders
Aggregate	
  
count(*)
Subplan	
  Pseudo-­‐Code	
  
result	
  =	
  {}	
  
for	
  each	
  c	
  in	
  input	
  
	
  	
  set	
  c	
  in	
  dependent	
  nodes	
  
	
  	
  result	
  +=	
  rhsPlan.exec()	
  
return	
  result
Nested	
  
Row	
  Src
Cross	
  Join
set	
  c	
  in	
  dependent	
  nodes
Count	
  the	
  number	
  of	
  orders	
  per	
  customer
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  C
SELECT	
  c.id,	
  cnt,	
  avgp	
  
FROM	
  customer	
  c,	
  
	
  	
  (SELECT	
  count(*)	
  cnt	
  FROM	
  c.orders)	
  v1,	
  
	
  	
  (SELECT	
  avg(price)	
  avgp	
  FROM	
  c.orders.items)	
  v2
Scan	
  c
Subplan
Unnest	
  
c.orders
Aggregate	
  
count(*)
Unnest	
  
c.orders.items
Aggregate	
  
avg(price)
Cross	
  Join
Return	
  the	
  number	
  of	
  orders	
  and	
  
the	
  average	
  item	
  price	
  per	
  customer
Nested	
  Row	
  
Src
Cross	
  Join
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  D
SELECT	
  c.id,	
  cnt,	
  avgp	
  
FROM	
  customer	
  c,	
  
	
  	
  (SELECT	
  count(*)	
  cnt	
  FROM	
  c.orders	
  o	
  
	
  	
  	
  WHERE	
  (SELECT	
  sum(price)	
  FROM	
  o.items)	
  >	
  100))	
  v
Scan	
  c
Subplan
Unnest	
  
c.orders
Aggregate	
  
count(*)
Unnest	
  
o.items
Aggregate	
  
sum(price)	
  
sum(price)>100
Cross	
  Join
For	
  each	
  customer,	
  return	
  the	
  number	
  of	
  orders	
  
whose	
  total	
  item	
  price	
  exceeds	
  100
Subplan
Nested	
  Row	
  
Src:	
  c
Nested	
  Row	
  
Src:	
  o
Cross	
  Join
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Future	
  Work
• Syntax	
  extensions	
  
• Performance	
  improvements	
  
• Parquet:	
  Scan	
  columns	
  directly	
  in	
  “unnest”,	
  avoid	
  collection	
  materialization	
  
• Codegen	
  subplans	
  
• INSERT	
  queries,	
  e.g.,	
  convert	
  flat	
  data	
  into	
  nested	
  data	
  
• UDTF	
  support	
  
• More	
  formats:	
  JSON,	
  XML,	
  etc.
SELECT	
   c.id,	
  
	
  	
   	
   count(c.orders),	
  
	
   	
   avg(c.orders.items.price)	
  
FROM	
  customer	
  c
SELECT	
  c.id,	
  cnt,	
  avgp	
  
FROM	
  customer	
  c,	
  
	
  	
  (SELECT	
  count(1)	
  cnt	
  FROM	
  c.orders)	
  v1,	
  
	
  	
  (SELECT	
  avg(price)	
  avgp	
  FROM	
  c.orders.items)	
  v2
Rewrite
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Thank	
  you
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Appendix
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Comparison	
  of	
  Impala	
  and	
  HiveQL
• Impala’s	
  syntax	
  provides	
  a	
  superset	
  of	
  Hive’s	
  functionality	
  
• HiveQL	
  has	
  similar	
  path	
  expressions	
  but	
  with	
  restrictions	
  
• Must	
  use	
  LATERAL	
  VIEW	
  in	
  FROM	
  clause;	
  more	
  verbose	
  syntax	
  
• LATERAL	
  VIEWs	
  themselves	
  have	
  many	
  restrictions,	
  no	
  arbitrary	
  SQL	
  
• Requires	
  complex	
  joins	
  or	
  unique	
  ids	
  at	
  various	
  nesting	
  levels	
  for	
  expressing	
  even	
  simple	
  queries	
  (e.g.,	
  
find	
  number	
  of	
  orders	
  per	
  customer)	
  
• Does	
  not	
  provide	
  similar	
  inline	
  view	
  and	
  subquery	
  capabilities
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  vs.	
  Hive	
  Syntax
SELECT	
  …	
  FROM	
  customer	
  c,	
  c.orders	
  o,	
  o.items	
  i
SELECT	
  …	
  FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  explode(c.orders)	
  as	
  (c1,	
  c2…)	
  
LATERAL	
  VIEW	
  explode(c.orders.items)	
  as	
  (c1,	
  c2…)
SELECT	
  …	
  FROM	
  customer	
  c	
  
LEFT	
  JOIN	
  c.orders	
  LEFT	
  JOIN	
  c.orders.items
SELECT	
  …	
  FROM	
  customer	
  c	
  
OUTER	
  LATERAL	
  VIEW	
  explode(c.orders)	
  as	
  (c1,	
  …)	
  
OUTER	
  LATERAL	
  VIEW	
  explode(c.orders.items)	
  as	
  (c1,	
  …)
SELECT	
  c.id	
  FROM	
  customer	
  c,	
  
LEFT	
  ANTI	
  JOIN	
  (SELECT	
  oid	
  FROM	
  c.orders)	
  v	
  
ON	
  c.preferred_cc	
  =	
  v.cc
SELECT	
  c.id	
  FROM	
  customer	
  c	
  
WHERE	
  NOT	
  EXISTS	
  (SELECT	
  oid	
  FROM	
  c.orders	
  WHERE	
  
c.preferred_cc	
  =	
  orders.cc)
No	
  convenient/performant	
  equivalent	
  in	
  Hive.
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  vs.	
  Hive	
  Syntax
SELECT	
  …	
  
FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  MY_UDTF(c.orders)	
  as	
  (c1,	
  c2…)
Impala	
  will	
  not	
  support	
  Hive’s	
  builtin	
  or	
  user-­‐defined	
  
table	
  generating	
  functions	
  for	
  now.
SELECT	
  …	
  
FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  json_tuple(c.json_str,	
  …)	
  as	
  (c1,	
  c2…)
SELECT	
  count(c.orders)	
  FROM	
  customer	
  c
SELECT	
  cnt	
  FROM	
  customer	
  c,	
  
JOIN	
  (SELECT	
  count(1)	
  cnt	
  FROM	
  c.orders)	
  v1
SELECT	
  count(1)	
  
FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  explode(c.orders)	
  as	
  (c1,	
  c2…)	
  
GROUP	
  BY	
  c.orders	
  
(in	
  absence	
  of	
  a	
  unique	
  key	
  in	
  ‘customer’)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  vs.	
  Hive	
  Syntax
SELECT	
   c.id,	
  
	
  	
   	
   count(c.orders),	
  
	
   	
   avg(c.orders.items.price)	
  
FROM	
  customer	
  c
SELECT	
  c.id,	
  cnt,	
  avgp	
  
FROM	
  customer	
  c,	
  
JOIN	
  (SELECT	
  count(1)	
  cnt	
  FROM	
  c.orders)	
  v1	
  
JOIN	
  (SELECT	
  avg(price)	
  avgp	
  FROM	
  c.orders.items)	
  v2
No	
  convenient/performant	
  equivalent	
  in	
  Hive.
• Impala	
  is	
  more	
  expressive,	
  but	
  less	
  extensible	
  until	
  UDTFs	
  are	
  supported	
  
• More	
  join	
  types:	
  inner/outer/semi/anti	
  
• Full	
  SQL	
  block	
  inside	
  correlated	
  inline	
  view	
  
• Hive	
  more	
  extensible	
  (UDTFs),	
  has	
  builtin	
  UDTFs	
  
• Hive	
  Lateral	
  View	
  very	
  rigid,	
  no	
  arbitrary	
  SQL	
  inside
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
Josh	
  Wills’s	
  Blog	
  post	
  on	
  analyzing	
  misspelled	
  queries	
  
http://blog.cloudera.com/blog/2014/08/how-to-count-events-like-a-data-scientist/	
  
• Goal:	
  Rudimentary	
  spell	
  checker	
  based	
  on	
  counting	
  query/click	
  events	
  
• Problem:	
  Cross	
  referencing	
  items	
  in	
  multiple	
  nested	
  collections	
  
• Representative	
  of	
  many	
  machine	
  learning	
  tasks	
  (Josh	
  tells	
  me)	
  
• Goal	
  was	
  not	
  naturally	
  achievable	
  with	
  Hive	
  	
  
• Josh	
  implemented	
  a	
  custom	
  Hive	
  extension	
  “WITHIN”	
  
• How	
  can	
  Impala	
  serve	
  this	
  use	
  case?
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
account_id:	
  bigint	
  
search_events:	
  array<struct<	
  
	
  	
  event_id:	
  bigint	
  
	
  	
  query:	
  string	
  
	
  	
  tstamp_sec:	
  bigint	
  
	
  	
  ...	
  
>>	
  
install_events:	
  array<struct<	
  
	
  	
  event_id:	
  bigint	
  
	
  	
  search_event_id:	
  bigint	
  
	
  	
  app_id:	
  bigint	
  
	
  	
  ...	
  
>>
SELECT	
  a.qw,	
  a.qr,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  
LATERAL	
  VIEW	
  WITHIN(	
  
	
  	
  "SELECT	
  bad.query	
  qw,	
  good.query	
  qr	
  
	
  	
  FROM	
  t1	
  as	
  bad,	
  t1	
  as	
  good	
  
	
  	
  WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -­‐	
  bad.tstamp_sec	
  <	
  30	
  
	
  	
  AND	
  bad.event_id	
  NOT	
  IN	
  (select	
  search_event_id	
  FROM	
  t2)	
  
	
  	
  AND	
  good.event_id	
  IN	
  (select	
  search_event_id	
  FROM	
  t2)",	
  
search_events,	
  install_events)	
  a	
  
GROUP	
  BY	
  a.qw,	
  a.qr;
SELECT	
  bad.query,	
  good.query,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  s,	
  
	
  	
  s.search_events	
  bad,	
  
	
  	
  s.search_events	
  good,	
  
WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -­‐	
  bad.tstamp_sec	
  <	
  30	
  
	
  	
  AND	
  bad.event_id	
  NOT	
  IN	
  (select	
  search_event_id	
  FROM	
  s.install_events)	
  
	
  	
  AND	
  good.event_id	
  IN	
  (select	
  search_event_id	
  FROM	
  s.install_events),	
  
GROUP	
  BY	
  bad.query,	
  good.query;
Josh’s	
  HiveQL	
  extension
Impala	
  SQL
Schema
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
SELECT	
  bad.query,	
  good.query,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  s,	
  
	
  	
  s.search_events	
  bad,	
  
	
  	
  s.search_events	
  good,	
  
WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -­‐	
  bad.tstamp_sec	
  <	
  30	
  
	
  	
  AND	
  bad.event_id	
  NOT	
  IN	
  (select	
  search_event_id	
  FROM	
  s.install_events)	
  
	
  	
  AND	
  good.event_id	
  IN	
  (select	
  search_event_id	
  FROM	
  s.install_events),	
  
GROUP	
  BY	
  bad.query,	
  good.query;
SELECT	
  bad.query,	
  good.query,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  s	
  
JOIN	
  (SELECT	
  *	
  FROM	
  s.search_events)	
  bad,	
  
JOIN	
  (SELECT	
  *	
  FROM	
  s.search_events)	
  good,	
  
LEFT	
  ANTI	
  JOIN	
  (SELECT	
  search_event_id	
  FROM	
  s.install_events)	
  v1	
  
ON	
  (bad.event_id	
  =	
  v1.search_event_id)	
  
LEFT	
  SEMI	
  JOIN	
  (SELECT	
  search_event_id	
  FROM	
  s.install_events)	
  v2	
  
ON	
  (good.event_id	
  =	
  v2.search_event_id)	
  
WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -­‐	
  bad.tstamp_sec	
  <	
  30	
  
GROUP	
  BY	
  bad.query,	
  good.query;
Rewrite
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
Scan	
  s
Subplan
Cross	
  Join	
  
bad.ts	
  <	
  good.ts	
  AND	
  
good.ts	
  –	
  bad.ts	
  <	
  30
Unnest	
  
s.search_events	
  good
Left	
  Anti	
  Join	
  
bad.event_id	
  =	
  
v1.search_event_id
Unnest	
  
s.install_events	
  v1
Left	
  Semi	
  Join	
  
good.event_id	
  =	
  
v2.search_event_id
Unnest	
  
s.install_events	
  v2
Aggregate	
  
count(*)	
  group	
  by	
  
bad.query,	
  good.query
Unnest	
  
s.search_events	
  bad
SELECT	
  bad.query,	
  good.query,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  s	
  
JOIN	
  (SELECT	
  *	
  FROM	
  s.search_events)	
  bad,	
  
JOIN	
  (SELECT	
  *	
  FROM	
  s.search_events)	
  good,	
  
LEFT	
  ANTI	
  JOIN	
  (SELECT	
  search_event_id	
  FROM	
  s.install_events)	
  v1	
  
ON	
  (bad.event_id	
  =	
  v1.search_event_id)	
  
LEFT	
  SEMI	
  JOIN	
  (SELECT	
  search_event_id	
  FROM	
  s.install_events)	
  v2	
  
ON	
  (good.event_id	
  =	
  v2.search_event_id)	
  
WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -­‐	
  bad.tstamp_sec	
  <	
  30	
  
GROUP	
  BY	
  bad.query,	
  good.query;

Contenu connexe

Tendances

Operations, Consistency, Failover for Multi-DC Clusters (Alexander Dejanovski...
Operations, Consistency, Failover for Multi-DC Clusters (Alexander Dejanovski...Operations, Consistency, Failover for Multi-DC Clusters (Alexander Dejanovski...
Operations, Consistency, Failover for Multi-DC Clusters (Alexander Dejanovski...
DataStax
 
独立系研究者としての生き方
独立系研究者としての生き方独立系研究者としての生き方
独立系研究者としての生き方
博士のシェアハウス
 

Tendances (20)

最近のストリーム処理事情振り返り
最近のストリーム処理事情振り返り最近のストリーム処理事情振り返り
最近のストリーム処理事情振り返り
 
Intro to Pinot (2016-01-04)
Intro to Pinot (2016-01-04)Intro to Pinot (2016-01-04)
Intro to Pinot (2016-01-04)
 
Operations, Consistency, Failover for Multi-DC Clusters (Alexander Dejanovski...
Operations, Consistency, Failover for Multi-DC Clusters (Alexander Dejanovski...Operations, Consistency, Failover for Multi-DC Clusters (Alexander Dejanovski...
Operations, Consistency, Failover for Multi-DC Clusters (Alexander Dejanovski...
 
Top 10 Mistakes When Migrating From Oracle to PostgreSQL
Top 10 Mistakes When Migrating From Oracle to PostgreSQLTop 10 Mistakes When Migrating From Oracle to PostgreSQL
Top 10 Mistakes When Migrating From Oracle to PostgreSQL
 
全部暗記でいいのに.pptx
全部暗記でいいのに.pptx全部暗記でいいのに.pptx
全部暗記でいいのに.pptx
 
게임을 위한 DynamoDB 사례 및 팁 - 김일호 솔루션즈 아키텍트:: AWS Cloud Track 3 Gaming
게임을 위한 DynamoDB 사례 및 팁 - 김일호 솔루션즈 아키텍트:: AWS Cloud Track 3 Gaming게임을 위한 DynamoDB 사례 및 팁 - 김일호 솔루션즈 아키텍트:: AWS Cloud Track 3 Gaming
게임을 위한 DynamoDB 사례 및 팁 - 김일호 솔루션즈 아키텍트:: AWS Cloud Track 3 Gaming
 
PostgreSQLアーキテクチャ入門
PostgreSQLアーキテクチャ入門PostgreSQLアーキテクチャ入門
PostgreSQLアーキテクチャ入門
 
大森ゼミ新歓
大森ゼミ新歓大森ゼミ新歓
大森ゼミ新歓
 
Java EE 8新機能解説 -Bean Validation 2.0編-
Java EE 8新機能解説 -Bean Validation 2.0編-Java EE 8新機能解説 -Bean Validation 2.0編-
Java EE 8新機能解説 -Bean Validation 2.0編-
 
Top 5 Mistakes When Writing Spark Applications
Top 5 Mistakes When Writing Spark ApplicationsTop 5 Mistakes When Writing Spark Applications
Top 5 Mistakes When Writing Spark Applications
 
爆速クエリエンジン”Presto”を使いたくなる話
爆速クエリエンジン”Presto”を使いたくなる話爆速クエリエンジン”Presto”を使いたくなる話
爆速クエリエンジン”Presto”を使いたくなる話
 
給開發人員的資料庫效能建議
給開發人員的資料庫效能建議給開發人員的資料庫效能建議
給開發人員的資料庫效能建議
 
Hadoop -ResourceManager HAの仕組み-
Hadoop -ResourceManager HAの仕組み-Hadoop -ResourceManager HAの仕組み-
Hadoop -ResourceManager HAの仕組み-
 
情報検索における質問者の プライバシー保護 :Private Information Retrieval
情報検索における質問者のプライバシー保護 :Private Information Retrieval情報検索における質問者のプライバシー保護 :Private Information Retrieval
情報検索における質問者の プライバシー保護 :Private Information Retrieval
 
StanとRでベイズ統計モデリング読書会 Chapter 7(7.6-7.9) 回帰分析の悩みどころ ~統計の力で歌うまになりたい~
StanとRでベイズ統計モデリング読書会 Chapter 7(7.6-7.9) 回帰分析の悩みどころ ~統計の力で歌うまになりたい~StanとRでベイズ統計モデリング読書会 Chapter 7(7.6-7.9) 回帰分析の悩みどころ ~統計の力で歌うまになりたい~
StanとRでベイズ統計モデリング読書会 Chapter 7(7.6-7.9) 回帰分析の悩みどころ ~統計の力で歌うまになりたい~
 
データモデリング入門2021
データモデリング入門2021データモデリング入門2021
データモデリング入門2021
 
Deletes Without Tombstones or TTLs (Eric Stevens, ProtectWise) | Cassandra Su...
Deletes Without Tombstones or TTLs (Eric Stevens, ProtectWise) | Cassandra Su...Deletes Without Tombstones or TTLs (Eric Stevens, ProtectWise) | Cassandra Su...
Deletes Without Tombstones or TTLs (Eric Stevens, ProtectWise) | Cassandra Su...
 
RStanとShinyStanによるベイズ統計モデリング入門
RStanとShinyStanによるベイズ統計モデリング入門RStanとShinyStanによるベイズ統計モデリング入門
RStanとShinyStanによるベイズ統計モデリング入門
 
分散処理基盤Apache Hadoop入門とHadoopエコシステムの最新技術動向 (オープンソースカンファレンス 2015 Tokyo/Spring 講...
分散処理基盤Apache Hadoop入門とHadoopエコシステムの最新技術動向 (オープンソースカンファレンス 2015 Tokyo/Spring 講...分散処理基盤Apache Hadoop入門とHadoopエコシステムの最新技術動向 (オープンソースカンファレンス 2015 Tokyo/Spring 講...
分散処理基盤Apache Hadoop入門とHadoopエコシステムの最新技術動向 (オープンソースカンファレンス 2015 Tokyo/Spring 講...
 
独立系研究者としての生き方
独立系研究者としての生き方独立系研究者としての生き方
独立系研究者としての生き方
 

Similaire à Nested Types in Impala

Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Cloudera, Inc.
 

Similaire à Nested Types in Impala (20)

Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
 
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
 
U-SQL - Azure Data Lake Analytics for Developers
U-SQL - Azure Data Lake Analytics for DevelopersU-SQL - Azure Data Lake Analytics for Developers
U-SQL - Azure Data Lake Analytics for Developers
 
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
 
Cassandra Basics, Counters and Time Series Modeling
Cassandra Basics, Counters and Time Series ModelingCassandra Basics, Counters and Time Series Modeling
Cassandra Basics, Counters and Time Series Modeling
 
ADL/U-SQL Introduction (SQLBits 2016)
ADL/U-SQL Introduction (SQLBits 2016)ADL/U-SQL Introduction (SQLBits 2016)
ADL/U-SQL Introduction (SQLBits 2016)
 
AWS Summit Seoul 2015 - AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...
AWS Summit Seoul 2015 -  AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...AWS Summit Seoul 2015 -  AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...
AWS Summit Seoul 2015 - AWS 최신 서비스 살펴보기 - Aurora, Lambda, EFS, Machine Learn...
 
Old code doesn't stink
Old code doesn't stinkOld code doesn't stink
Old code doesn't stink
 
Beg sql
Beg sqlBeg sql
Beg sql
 
Beg sql
Beg sqlBeg sql
Beg sql
 
3 CityNetConf - sql+c#=u-sql
3 CityNetConf - sql+c#=u-sql3 CityNetConf - sql+c#=u-sql
3 CityNetConf - sql+c#=u-sql
 
Sql Server - Apresentação
Sql Server - ApresentaçãoSql Server - Apresentação
Sql Server - Apresentação
 
Dbms lab Manual
Dbms lab ManualDbms lab Manual
Dbms lab Manual
 
Dublin Ireland Spark Meetup October 15, 2015
Dublin Ireland Spark Meetup October 15, 2015Dublin Ireland Spark Meetup October 15, 2015
Dublin Ireland Spark Meetup October 15, 2015
 
Sql server T-sql basics ppt-3
Sql server T-sql basics  ppt-3Sql server T-sql basics  ppt-3
Sql server T-sql basics ppt-3
 
Building a Real-time Streaming ETL Framework Using ksqlDB and NoSQL
Building a Real-time Streaming ETL Framework Using ksqlDB and NoSQLBuilding a Real-time Streaming ETL Framework Using ksqlDB and NoSQL
Building a Real-time Streaming ETL Framework Using ksqlDB and NoSQL
 
Tricks every ClickHouse designer should know, by Robert Hodges, Altinity CEO
Tricks every ClickHouse designer should know, by Robert Hodges, Altinity CEOTricks every ClickHouse designer should know, by Robert Hodges, Altinity CEO
Tricks every ClickHouse designer should know, by Robert Hodges, Altinity CEO
 
Event streaming webinar feb 2020
Event streaming webinar feb 2020Event streaming webinar feb 2020
Event streaming webinar feb 2020
 
Anything SQL: Lightning Talks
Anything SQL: Lightning TalksAnything SQL: Lightning Talks
Anything SQL: Lightning Talks
 
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project ADN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
DN 2017 | Reducing pain in data engineering | Martin Loetzsch | Project A
 

Plus de Cloudera, Inc.

Plus de Cloudera, Inc. (20)

Partner Briefing_January 25 (FINAL).pptx
Partner Briefing_January 25 (FINAL).pptxPartner Briefing_January 25 (FINAL).pptx
Partner Briefing_January 25 (FINAL).pptx
 
Cloudera Data Impact Awards 2021 - Finalists
Cloudera Data Impact Awards 2021 - Finalists Cloudera Data Impact Awards 2021 - Finalists
Cloudera Data Impact Awards 2021 - Finalists
 
2020 Cloudera Data Impact Awards Finalists
2020 Cloudera Data Impact Awards Finalists2020 Cloudera Data Impact Awards Finalists
2020 Cloudera Data Impact Awards Finalists
 
Edc event vienna presentation 1 oct 2019
Edc event vienna presentation 1 oct 2019Edc event vienna presentation 1 oct 2019
Edc event vienna presentation 1 oct 2019
 
Machine Learning with Limited Labeled Data 4/3/19
Machine Learning with Limited Labeled Data 4/3/19Machine Learning with Limited Labeled Data 4/3/19
Machine Learning with Limited Labeled Data 4/3/19
 
Data Driven With the Cloudera Modern Data Warehouse 3.19.19
Data Driven With the Cloudera Modern Data Warehouse 3.19.19Data Driven With the Cloudera Modern Data Warehouse 3.19.19
Data Driven With the Cloudera Modern Data Warehouse 3.19.19
 
Introducing Cloudera DataFlow (CDF) 2.13.19
Introducing Cloudera DataFlow (CDF) 2.13.19Introducing Cloudera DataFlow (CDF) 2.13.19
Introducing Cloudera DataFlow (CDF) 2.13.19
 
Introducing Cloudera Data Science Workbench for HDP 2.12.19
Introducing Cloudera Data Science Workbench for HDP 2.12.19Introducing Cloudera Data Science Workbench for HDP 2.12.19
Introducing Cloudera Data Science Workbench for HDP 2.12.19
 
Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19
Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19
Shortening the Sales Cycle with a Modern Data Warehouse 1.30.19
 
Leveraging the cloud for analytics and machine learning 1.29.19
Leveraging the cloud for analytics and machine learning 1.29.19Leveraging the cloud for analytics and machine learning 1.29.19
Leveraging the cloud for analytics and machine learning 1.29.19
 
Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19
Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19
Modernizing the Legacy Data Warehouse – What, Why, and How 1.23.19
 
Leveraging the Cloud for Big Data Analytics 12.11.18
Leveraging the Cloud for Big Data Analytics 12.11.18Leveraging the Cloud for Big Data Analytics 12.11.18
Leveraging the Cloud for Big Data Analytics 12.11.18
 
Modern Data Warehouse Fundamentals Part 3
Modern Data Warehouse Fundamentals Part 3Modern Data Warehouse Fundamentals Part 3
Modern Data Warehouse Fundamentals Part 3
 
Modern Data Warehouse Fundamentals Part 2
Modern Data Warehouse Fundamentals Part 2Modern Data Warehouse Fundamentals Part 2
Modern Data Warehouse Fundamentals Part 2
 
Modern Data Warehouse Fundamentals Part 1
Modern Data Warehouse Fundamentals Part 1Modern Data Warehouse Fundamentals Part 1
Modern Data Warehouse Fundamentals Part 1
 
Extending Cloudera SDX beyond the Platform
Extending Cloudera SDX beyond the PlatformExtending Cloudera SDX beyond the Platform
Extending Cloudera SDX beyond the Platform
 
Federated Learning: ML with Privacy on the Edge 11.15.18
Federated Learning: ML with Privacy on the Edge 11.15.18Federated Learning: ML with Privacy on the Edge 11.15.18
Federated Learning: ML with Privacy on the Edge 11.15.18
 
Analyst Webinar: Doing a 180 on Customer 360
Analyst Webinar: Doing a 180 on Customer 360Analyst Webinar: Doing a 180 on Customer 360
Analyst Webinar: Doing a 180 on Customer 360
 
Build a modern platform for anti-money laundering 9.19.18
Build a modern platform for anti-money laundering 9.19.18Build a modern platform for anti-money laundering 9.19.18
Build a modern platform for anti-money laundering 9.19.18
 
Introducing the data science sandbox as a service 8.30.18
Introducing the data science sandbox as a service 8.30.18Introducing the data science sandbox as a service 8.30.18
Introducing the data science sandbox as a service 8.30.18
 

Dernier

CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICECHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
9953056974 Low Rate Call Girls In Saket, Delhi NCR
 
The title is not connected to what is inside
The title is not connected to what is insideThe title is not connected to what is inside
The title is not connected to what is inside
shinachiaurasa2
 
%+27788225528 love spells in Colorado Springs Psychic Readings, Attraction sp...
%+27788225528 love spells in Colorado Springs Psychic Readings, Attraction sp...%+27788225528 love spells in Colorado Springs Psychic Readings, Attraction sp...
%+27788225528 love spells in Colorado Springs Psychic Readings, Attraction sp...
masabamasaba
 
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
masabamasaba
 

Dernier (20)

Software Quality Assurance Interview Questions
Software Quality Assurance Interview QuestionsSoftware Quality Assurance Interview Questions
Software Quality Assurance Interview Questions
 
Architecture decision records - How not to get lost in the past
Architecture decision records - How not to get lost in the pastArchitecture decision records - How not to get lost in the past
Architecture decision records - How not to get lost in the past
 
8257 interfacing 2 in microprocessor for btech students
8257 interfacing 2 in microprocessor for btech students8257 interfacing 2 in microprocessor for btech students
8257 interfacing 2 in microprocessor for btech students
 
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICECHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
CHEAP Call Girls in Pushp Vihar (-DELHI )🔝 9953056974🔝(=)/CALL GIRLS SERVICE
 
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
W01_panagenda_Navigating-the-Future-with-The-Hitchhikers-Guide-to-Notes-and-D...
 
%in kempton park+277-882-255-28 abortion pills for sale in kempton park
%in kempton park+277-882-255-28 abortion pills for sale in kempton park %in kempton park+277-882-255-28 abortion pills for sale in kempton park
%in kempton park+277-882-255-28 abortion pills for sale in kempton park
 
Announcing Codolex 2.0 from GDK Software
Announcing Codolex 2.0 from GDK SoftwareAnnouncing Codolex 2.0 from GDK Software
Announcing Codolex 2.0 from GDK Software
 
The title is not connected to what is inside
The title is not connected to what is insideThe title is not connected to what is inside
The title is not connected to what is inside
 
Chinsurah Escorts ☎️8617697112 Starting From 5K to 15K High Profile Escorts ...
Chinsurah Escorts ☎️8617697112  Starting From 5K to 15K High Profile Escorts ...Chinsurah Escorts ☎️8617697112  Starting From 5K to 15K High Profile Escorts ...
Chinsurah Escorts ☎️8617697112 Starting From 5K to 15K High Profile Escorts ...
 
SHRMPro HRMS Software Solutions Presentation
SHRMPro HRMS Software Solutions PresentationSHRMPro HRMS Software Solutions Presentation
SHRMPro HRMS Software Solutions Presentation
 
%+27788225528 love spells in Vancouver Psychic Readings, Attraction spells,Br...
%+27788225528 love spells in Vancouver Psychic Readings, Attraction spells,Br...%+27788225528 love spells in Vancouver Psychic Readings, Attraction spells,Br...
%+27788225528 love spells in Vancouver Psychic Readings, Attraction spells,Br...
 
Exploring the Best Video Editing App.pdf
Exploring the Best Video Editing App.pdfExploring the Best Video Editing App.pdf
Exploring the Best Video Editing App.pdf
 
The Top App Development Trends Shaping the Industry in 2024-25 .pdf
The Top App Development Trends Shaping the Industry in 2024-25 .pdfThe Top App Development Trends Shaping the Industry in 2024-25 .pdf
The Top App Development Trends Shaping the Industry in 2024-25 .pdf
 
%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein
%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein
%in Stilfontein+277-882-255-28 abortion pills for sale in Stilfontein
 
Introducing Microsoft’s new Enterprise Work Management (EWM) Solution
Introducing Microsoft’s new Enterprise Work Management (EWM) SolutionIntroducing Microsoft’s new Enterprise Work Management (EWM) Solution
Introducing Microsoft’s new Enterprise Work Management (EWM) Solution
 
%+27788225528 love spells in Colorado Springs Psychic Readings, Attraction sp...
%+27788225528 love spells in Colorado Springs Psychic Readings, Attraction sp...%+27788225528 love spells in Colorado Springs Psychic Readings, Attraction sp...
%+27788225528 love spells in Colorado Springs Psychic Readings, Attraction sp...
 
%in Midrand+277-882-255-28 abortion pills for sale in midrand
%in Midrand+277-882-255-28 abortion pills for sale in midrand%in Midrand+277-882-255-28 abortion pills for sale in midrand
%in Midrand+277-882-255-28 abortion pills for sale in midrand
 
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
%+27788225528 love spells in Atlanta Psychic Readings, Attraction spells,Brin...
 
10 Trends Likely to Shape Enterprise Technology in 2024
10 Trends Likely to Shape Enterprise Technology in 202410 Trends Likely to Shape Enterprise Technology in 2024
10 Trends Likely to Shape Enterprise Technology in 2024
 
call girls in Vaishali (Ghaziabad) 🔝 >༒8448380779 🔝 genuine Escort Service 🔝✔️✔️
call girls in Vaishali (Ghaziabad) 🔝 >༒8448380779 🔝 genuine Escort Service 🔝✔️✔️call girls in Vaishali (Ghaziabad) 🔝 >༒8448380779 🔝 genuine Escort Service 🔝✔️✔️
call girls in Vaishali (Ghaziabad) 🔝 >༒8448380779 🔝 genuine Escort Service 🔝✔️✔️
 

Nested Types in Impala

  • 1. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Nested  Types  in  Impala Alex  Behm  //  Cloudera,  Inc.   Marcel  Kornacker  //  Cloudera,  Inc.   Skye  Wanderman-­‐Milne  //  Cloudera,  Inc. Impala  Meetup  03/24/2015
  • 2. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Design  Goals • Goals   • Support  for  nested  data  types:  struct,  map,  array   • Full  expressiveness  of  SQL  with  nested  structures   • v1  Prioritization   • Focus  on  SELECT  queries  (INSERT  in  later  releases)   • Focus  on  native  Parquet  and  Avro  formats  (XML,  JSON,  etc  in  later  releases)   • Focus  on  built-­‐in  language  expressiveness  (UDTF  extensibility  in  later  releases)
  • 3. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  Schema CREATE TABLE Customers { id BIGINT, address STRUCT< city: STRING, zip: INT > orders ARRAY<STRUCT< txn_time: TIMESTAMP, cc: BIGINT, items: ARRAY<STRUCT< item_no: STRING, price: DECIMAL(9,2) >> >> preferred_cc BIGINT }
  • 4. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  Extensions • Path  expressions  extend  column  references  to  scalars  (nested  structs)   • Can  appear  anywhere  a  conventional  column  reference  is  used   • Collections  (maps  and  arrays)  are  exposed  like  sub-­‐tables   • Use  FROM  clause  to  specify  which  collections  to  read  like  conventional  tables   • Can  use  JOIN  conditions  to  express  join  relationship  (default  is  INNER  JOIN) Find  the  ids  and  city  of  customers  who  live  in  the  zip  code  94305:   SELECT  id,  address.city  FROM  customers  WHERE  address.zip  =  94305 Find  all  orders  that  were  paid  for  with  a  customer’s  preferred  credit  card:   SELECT  o.txn_id  FROM  customers  c,  c.orders  o  WHERE  o.cc  =  c.preferred_cc    
  • 5. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Referencing  Arrays  &  Maps • Basic  idea:  Flatten  nested  collections  referenced  in  the  FROM  clause   • Can  be  thought  of  as  an  implicit  join  on  the  parent/child  relationship SELECT  c.id,  o.txn_id  FROM  customers  c,  c.orders  o c.id o.txn_id 100 203 100 305 100 507 … … 101 10056 101 10 … … id  of  a  customer  repeated   for  every  order
  • 6. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Referencing  Arrays  &  Maps SELECT  c.id,  o.txn_id  FROM  customers  c,  c.orders  o   SELECT  c.id,  o.txn_id  FROM  customer  c  INNER  JOIN  c.orders  o Returns  customer/order  data  for  customers  that  have  at  least  one  order SELECT  c.id,  o.txn_id  FROM  customers  c  LEFT  OUTER  JOIN  c.orders  o Also  returns  customers  with  no  orders  (with  order  fields  NULL) SELECT  c.id,  o.txn_id  FROM  customers  c  LEFT  ANTI  JOIN  c.orders  o Find  all  customers  that  have  no  orders
  • 7. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Motivation  for  Advanced  Querying  Capabilities • Count  the  number  of  orders  per  customer   • Count  the  number  of  items  per  order   • Impractical  → Requires  unique  id  at  every  nesting  level   • Information  is  already  expressed  in  nesting  relationship!   • What  about  even  more  interesting  queries?   • Get  the  number  of  orders  and  the  average  item  price  per  customer?   • “Group  by”  multiple  nesting  levels SELECT  COUNT(*)  FROM  customers  c,  c.orders  o  GROUP  BY  c.id SELECT  COUNT(*)  FROM  customers.orders  o,  o.items  GROUP  BY  ??? Must  be  unique
  • 8. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Advanced  Querying:  Correlated  Table  References • Count  the  number  of  orders  per  customer   • Count  the  number  of  items  per  order   • Get  the  number  of  orders  and  the  average  item  price  per  customer SELECT  cnt  FROM  customers  c,  (SELECT  COUNT(*)  cnt  FROM  c.orders)  v SELECT  cnt  FROM  customers.orders  o,  (SELECT  COUNT(*)  cnt  FROM  o.items)  v SELECT  c.id,  cnt,  avgp   FROM  customers  c,      (SELECT  count(1)  cnt  FROM  c.orders)  v1,      (SELECT  avg(price)  avgp  FROM  c.orders.items)  v2 Correlated  reference  to  “c”  
  • 9. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Advanced  Querying:  Correlated  Table  References • Full  expressibility   • Arbitrary  SQL  allowed  in  inline  views  and  subqueries  with  correlated  table  refs   • Correlated  subqueries  transformed  into  joins  if  possible   • Exploits  nesting  relationship   • No  need  for  stored  unique  ids  at  various  levels   • Similar  to  standard  SQL  ‘LATERAL’,  ‘CROSS  APPLY’,  ‘OUTER  CROSS  APPLY’   • Goes  beyond  standard  in  some  aspects  (semi/anti  variants)   • Not  as  general  as  SQL  standard  (limited  correlations) SELECT  id  FROM  customers  c  WHERE  EXISTS      (SELECT  1  FROM  c.orders  o,  o.items  i  where  o.cc  =  c.preferred_cc  and  i.price  >  100)
  • 10. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Execution  Model Design  sweet  spot:  small/medium  sized  collections  (max  few  hundreds  of  MB)   • Built-­‐in  limitation  on  the  size  of  nested  collections  (TBD)   • Huge  collections  not  expected  to  be  performant  and  rare   Execution  Overview   • Scans  materialize  minimal  nested  structure  in  memory   • Three  new  exec  nodes   • Subplan:  Executes  its  subplan  tree  for  each  input  row  and  returns  the  rows   produced  by  the  subplan   • Nested  Row  Src:  Returns  the  current  input  row  of  its  parent  Subplan  node   • Unnest:  Scans  an  array  slot  of  the  current  input  row  of  its  parent  Subplan  node  or   of  an  output  row  from  its  child  plan  node,  returning  one  row  per  array  element.  
  • 11. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  A SELECT  count(*)  FROM  customer.orders.items Scan   customer.orders.items Aggregate   count(*) Count  the  total  number  of  items Exchange   (to  client)
  • 12. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  B SELECT  c.id,  cnt   FROM  customer  c,   JOIN  (SELECT  count(*)  cnt  FROM  c.orders)  v Scan  c Subplan materialize   c.orders Unnest   c.orders Aggregate   count(*) Subplan  Pseudo-­‐Code   result  =  {}   for  each  c  in  input      set  c  in  dependent  nodes      result  +=  rhsPlan.exec()   return  result Nested   Row  Src Cross  Join set  c  in  dependent  nodes Count  the  number  of  orders  per  customer Exchange   (to  client)
  • 13. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  C SELECT  c.id,  cnt,  avgp   FROM  customer  c,      (SELECT  count(*)  cnt  FROM  c.orders)  v1,      (SELECT  avg(price)  avgp  FROM  c.orders.items)  v2 Scan  c Subplan Unnest   c.orders Aggregate   count(*) Unnest   c.orders.items Aggregate   avg(price) Cross  Join Return  the  number  of  orders  and   the  average  item  price  per  customer Nested  Row   Src Cross  Join Exchange   (to  client)
  • 14. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  D SELECT  c.id,  cnt   FROM  customer  c,      (SELECT  count(*)  cnt  FROM  c.orders  o        WHERE  (SELECT  sum(price)  FROM  o.items)  >  100)  v Scan  c Subplan Unnest   c.orders Aggregate   count(*) Unnest   o.items Aggregate   sum(price)   sum(price)>100 Cross  Join For  each  customer,  return  the  number  of  orders   whose  total  item  price  exceeds  100 Subplan Nested  Row   Src:  c Nested  Row   Src:  o Cross  Join Exchange   (to  client)
  • 15. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Future  Work • Syntax  extensions   • Performance  improvements   • Parquet:  Scan  columns  directly  in  “unnest”,  avoid  collection  materialization   • Codegen  subplans   • INSERT  queries,  e.g.,  convert  flat  data  into  nested  data   • UDTF  support   • More  formats:  JSON,  XML,  etc. SELECT   c.id,         count(c.orders),       avg(c.orders.items.price)   FROM  customer  c SELECT  c.id,  cnt,  avgp   FROM  customer  c,      (SELECT  count(1)  cnt  FROM  c.orders)  v1,      (SELECT  avg(price)  avgp  FROM  c.orders.items)  v2 Rewrite
  • 16. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Thank  you
  • 17. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Appendix
  • 18. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Comparison  of  Impala  and  HiveQL • Impala’s  syntax  provides  a  superset  of  Hive’s  functionality   • HiveQL  has  similar  path  expressions  but  with  restrictions   • Must  use  LATERAL  VIEW  in  FROM  clause;  more  verbose  syntax   • LATERAL  VIEWs  themselves  have  many  restrictions,  no  arbitrary  SQL   • Requires  complex  joins  or  unique  ids  at  various  nesting  levels  for  expressing  even  simple  queries  (e.g.,   find  number  of  orders  per  customer)   • Does  not  provide  similar  inline  view  and  subquery  capabilities
  • 19. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  vs.  Hive  Syntax SELECT  …  FROM  customer  c,  c.orders  o,  o.items  i SELECT  …  FROM  customer  c   LATERAL  VIEW  explode(c.orders)  as  (c1,  c2…)   LATERAL  VIEW  explode(c.orders.items)  as  (c1,  c2…) SELECT  …  FROM  customer  c   LEFT  JOIN  c.orders  LEFT  JOIN  c.orders.items SELECT  …  FROM  customer  c   OUTER  LATERAL  VIEW  explode(c.orders)  as  (c1,  …)   OUTER  LATERAL  VIEW  explode(c.orders.items)  as  (c1,  …) SELECT  c.id  FROM  customer  c,   LEFT  ANTI  JOIN  (SELECT  oid  FROM  c.orders)  v   ON  c.preferred_cc  =  v.cc SELECT  c.id  FROM  customer  c   WHERE  NOT  EXISTS  (SELECT  oid  FROM  c.orders  WHERE   c.preferred_cc  =  orders.cc) No  convenient/performant  equivalent  in  Hive.
  • 20. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  vs.  Hive  Syntax SELECT  …   FROM  customer  c   LATERAL  VIEW  MY_UDTF(c.orders)  as  (c1,  c2…) Impala  will  not  support  Hive’s  builtin  or  user-­‐defined   table  generating  functions  for  now. SELECT  …   FROM  customer  c   LATERAL  VIEW  json_tuple(c.json_str,  …)  as  (c1,  c2…) SELECT  count(c.orders)  FROM  customer  c SELECT  cnt  FROM  customer  c,   JOIN  (SELECT  count(1)  cnt  FROM  c.orders)  v1 SELECT  count(1)   FROM  customer  c   LATERAL  VIEW  explode(c.orders)  as  (c1,  c2…)   GROUP  BY  c.orders   (in  absence  of  a  unique  key  in  ‘customer’)
  • 21. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  vs.  Hive  Syntax SELECT   c.id,         count(c.orders),       avg(c.orders.items.price)   FROM  customer  c SELECT  c.id,  cnt,  avgp   FROM  customer  c,   JOIN  (SELECT  count(1)  cnt  FROM  c.orders)  v1   JOIN  (SELECT  avg(price)  avgp  FROM  c.orders.items)  v2 No  convenient/performant  equivalent  in  Hive. • Impala  is  more  expressive,  but  less  extensible  until  UDTFs  are  supported   • More  join  types:  inner/outer/semi/anti   • Full  SQL  block  inside  correlated  inline  view   • Hive  more  extensible  (UDTFs),  has  builtin  UDTFs   • Hive  Lateral  View  very  rigid,  no  arbitrary  SQL  inside
  • 22. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Nested  Types  in  Action Josh  Wills’s  blog  post  on  analyzing  misspelled  queries   http://blog.cloudera.com/blog/2014/08/how-to-count-events-like-a-data-scientist/   • Goal:  Rudimentary  spell  checker  based  on  counting  query/click  events   • Problem:  Cross  referencing  items  in  multiple  nested  collections   • Representative  of  many  machine  learning  tasks  (Josh  tells  me)   • Goal  was  not  naturally  achievable  with  Hive     • Josh  implemented  a  custom  Hive  extension  “WITHIN”   • How  can  Impala  serve  this  use  case?
  • 23. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Nested  Types  in  Action account_id:  bigint   search_events:  array<struct<      event_id:  bigint      query:  string      tstamp_sec:  bigint      ...   >>   install_events:  array<struct<      event_id:  bigint      search_event_id:  bigint      app_id:  bigint      ...   >> SELECT  a.qw,  a.qr,  count(*)  as  cnt   FROM  sessions   LATERAL  VIEW  WITHIN(      "SELECT  bad.query  qw,  good.query  qr      FROM  t1  as  bad,  t1  as  good      WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -­‐  bad.tstamp_sec  <  30      AND  bad.event_id  NOT  IN  (select  search_event_id  FROM  t2)      AND  good.event_id  IN  (select  search_event_id  FROM  t2)",   search_events,  install_events)  a   GROUP  BY  a.qw,  a.qr; SELECT  bad.query,  good.query,  count(*)  as  cnt   FROM  sessions  s,      s.search_events  bad,      s.search_events  good,   WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -­‐  bad.tstamp_sec  <  30      AND  bad.event_id  NOT  IN  (select  search_event_id  FROM  s.install_events)      AND  good.event_id  IN  (select  search_event_id  FROM  s.install_events),   GROUP  BY  bad.query,  good.query; Josh’s  HiveQL  extension Impala  SQL Schema
  • 24. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Nested  Types  in  Action SELECT  bad.query,  good.query,  count(*)  as  cnt   FROM  sessions  s,      s.search_events  bad,      s.search_events  good,   WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -  bad.tstamp_sec  <  30      AND  bad.event_id  NOT  IN  (select  search_event_id  FROM  s.install_events)      AND  good.event_id  IN  (select  search_event_id  FROM  s.install_events),   GROUP  BY  bad.query,  good.query; SELECT  bad.query,  good.query,  count(*)  as  cnt   FROM  sessions  s   JOIN  (SELECT  *  FROM  s.search_events)  bad,   JOIN  (SELECT  *  FROM  s.search_events)  good,   LEFT  ANTI  JOIN  (SELECT  search_event_id  FROM  s.install_events)  v1   ON  (bad.event_id  =  v1.search_event_id)   LEFT  SEMI  JOIN  (SELECT  search_event_id  FROM  s.install_events)  v2   ON  (good.event_id  =  v2.search_event_id)   WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -  bad.tstamp_sec  <  30   GROUP  BY  bad.query,  good.query; Rewrite
  • 25. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Nested  Types  in  Action Scan  s Subplan Cross  Join   bad.ts  <  good.ts  AND   good.ts  –  bad.ts  <  30 Unnest   s.search_events  bad Left  Anti  Join   bad.event_id  =   v1.search_event_id Unnest   s.install_events  v1 Left  Semi  Join   good.event_id  =   v2.search_event_id Unnest   s.install_events  v2 Aggregate   count(*)  group  by   bad.query,  good.query Unnest   s.search_events  good SELECT  bad.query,  good.query,  count(*)  as  cnt   FROM  sessions  s   JOIN  (SELECT  *  FROM  s.search_events)  bad,   JOIN  (SELECT  *  FROM  s.search_events)  good,   LEFT  ANTI  JOIN  (SELECT  search_event_id  FROM  s.install_events)  v1   ON  (bad.event_id  =  v1.search_event_id)   LEFT  SEMI  JOIN  (SELECT  search_event_id  FROM  s.install_events)  v2   ON  (good.event_id  =  v2.search_event_id)   WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -  bad.tstamp_sec  <  30   GROUP  BY  bad.query,  good.query;