SlideShare a Scribd company logo
1 of 9
Download to read offline
HackReduce
                  MapReduce Intro




Hopper.com (Greg Lu)
Project

         github.com/hackreduce/Hackathon




                       Wiki

      github.com/hackreduce/Hackathon/wiki




Download the GitHub project for some sample datasets
datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv



                                                                                                      }
NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35
NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60
NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23                                                    InputSplit 1
NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57



                                                                                                      }
NASDAQ,DYNT,2008-12-29,0.31,0.31,0.29,0.30,26900,0.30
NASDAQ,DMLP,2003-10-21,17.65,17.94,17.58,17.59,4800,9.73
NASDAQ,DORM,1997-02-07,7.88,7.88,7.63,7.75,7400,3.87                                                      InputSplit 2
NASDAQ,DXPE,2004-10-25,5.19,5.24,5.00,5.00,7600,2.50



                                                                                                      }
NASDAQ,DEST,2009-03-17,4.55,5.03,4.55,5.03,6800,5.03
NASDAQ,DBRN,1992-01-02,8.88,9.25,8.75,8.88,84800,2.22
NASDAQ,DXYN,1998-11-25,6.38,6.44,6.19,6.25,211100,6.25                                                    InputSplit 3
NASDAQ,DEAR,1998-12-08,10.50,11.50,10.50,10.50,5800,6.45
...
 org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version)
 public int run(String[] args) throws Exception {
     Configuration conf = getConf();

      if (args.length != 2) {
          System.err.println("Usage: " + getClass().getName() + " <input> <output>");
          System.exit(2);
      }

      // Creating the MapReduce job (configuration) object
      Job job = new Job(conf);
      job.setJarByClass(getClass());
      job.setJobName(getClass().getName());



                                                                                           }   Defines how the data is split
      // The Nasdaq/NYSE data dumps come in as CSV files (text input), so we configure
      // the job to use this format.
      job.setInputFormatClass(TextInputFormat.class);                                          and assigned to which mappers
      [...]
datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv
datasets/nasdaq/daily_prices



                                                                                                 }
NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35
NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60
NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23
                                                                                                      InputSplit 1
NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57




org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version)
 public int run(String[] args) throws Exception {
     [...]

     // Tell the job which Mapper and Reducer to use (classes defined above)
     job.setMapperClass(MarketCapitalizationMapper.class);
     job.setReducerClass(MarketCapitalizationReducer.class);
                                                                               }   Point the job to the custom classes that
                                                                                   we created in order to process the data.



                                                                               }
     // This is what the Mapper will be outputting to the Reducer
     job.setMapOutputKeyClass(Text.class);
     job.setMapOutputValueClass(DoubleWritable.class);                             Define the types of the (key, value)
     // This is what the Reducer will be outputting                                pairs that we’ll be outputting from the
     job.setOutputKeyClass(Text.class);
     job.setOutputValueClass(Text.class);
                                                                                   mappers and the result of the job itself.
     // Setting the input folder of the job
     FileInputFormat.addInputPath(job, new Path(args[0]));

     // Preparing the output folder by first deleting it if it exists
     Path output = new Path(args[1]);
     FileSystem.get(conf).delete(output, true);
     FileOutputFormat.setOutputPath(job, output);




                             Now we’ll show the MarketCapitalizationMapper class
datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv
datasets/nasdaq/daily_prices



                                                                                                            }
NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35
NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60
NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23
                                                                                                                   InputSplit 1
NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57



org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version)
public static class MarketCapitalizationMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {

    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String inputString = value.toString();

        String[] attributes = inputString.split(",");

        if (attributes.length != 9)
            throw new IllegalArgumentException("Input string given did not have 9 values in CSV format");

        try {
            String exchange = attributes[0];
            String stockSymbol = attributes[1];
            Date date = sdf.parse(attributes[2]);
            double stockPriceOpen = Double.parseDouble(attributes[3]);
            double stockPriceHigh = Double.parseDouble(attributes[4]);
            double stockPriceLow = Double.parseDouble(attributes[5]);
            double stockPriceClose = Double.parseDouble(attributes[6]);
            int stockVolume = Integer.parseInt(attributes[7]);
            double stockPriceAdjClose = Double.parseDouble(attributes[8]);
        } catch (ParseException e) {
            throw new IllegalArgumentException("Input string contained an unknown value that couldn't be parsed");
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Input string contained an unknown number value that couldn't be parsed");
        }

        double marketCap = stockPriceClose * stockVolume;
        context.write(new Text(stockSymbol), new DoubleWritable(marketCap));      }    This job doesn’t do a whole lot,
    }
                                                                                       but this is where the processing
}                                                                                      is occurring.
datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv
datasets/nasdaq/daily_prices



                                                                 }
NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35
NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60
NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23
                                                                      InputSplit 1
NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57


                                            (line-by-line)


                      MarketCapitalizationMapper
                                            (emits)

                               (DELL, 82.81*48736000)
                                 (DITC, 1.60*133600)
                                 (DLIA, 2.23*760800)
                                 (DWCH, 3.14*2400)

                                            (sorted and partitioned to specific reducers)


                     MarketCapitalizationReducer
(coming from different mappers)

                                                      (DELL, 82.81*48736000)
                                                      (DELL, 31.92*18678500)
                                                      (DELL, 23.85*16038700)
                                                      (DELL, 30.38*68759800)
                                                                     (...)

                                                                         (but arriving at the same reducer)

org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version)

/**
 * Reducer that, for each key, finds the largest value it receives and writes
 * it out formatted as a currency string.
 */
public static class MarketCapitalizationReducer extends Reducer<Text, DoubleWritable, Text, Text> {
    // Currency formatter for the JVM's default locale.
    NumberFormat currencyFormat = NumberFormat.getCurrencyInstance(Locale.getDefault());

    /**
     * Emits (key, formatted maximum) for the given group of values.
     *
     * @param key     the key shared by all values in this group
     * @param values  the values emitted by the mappers for this key
     * @param context receives the (key, currency string) output pair
     */
    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
        // Starts at 0.0, so the result is never below zero.
        double peak = 0.0;
        for (DoubleWritable candidate : values) {
            double cap = candidate.get();
            peak = Math.max(peak, cap);
        }

        String formattedPeak = currencyFormat.format(peak);
        context.write(key, new Text(formattedPeak));
    }
}



                                                                         (output of this reducer)

                                                   (DELL, $4,035,828,160.00)
/tmp/nasdaq_marketcaps/part-r-00000
   DAIO $1,515,345.00
DAKT $63,656,600.00
DANKY $89,668,857.00
DARA $1,464,720.00
DASTY $14,141,055.00
DATA $2,888,325.00
DAVE $5,144,800.00
DBLE $1,040,996.00
DBLEP $79,584.00
DBRN $131,023,326.00
DBTK $7,405,366.00
DCAI $20,058,990.00
DCGN $10,372,992.00
DCOM $12,298,208.00
DCTH $3,285,652.00
DDDC $79,176.00
DDIC $3,684,100.00
DDMX $7,811,204.00
DDRX $12,480,500.00
DDSS $4,545,438.00
DEAR $4,375,800.00
DECK $271,081,580.00
DEER $5,363,740.00
DEIX $5,285,892.00
We can dynamically increase your clusters if
you need the processing power, but it’s
typically bottlenecked by the code.

If your job takes longer than 10 minutes to
run, come see us.

More Related Content

What's hot

Business Intelligence Portfolio
Business Intelligence PortfolioBusiness Intelligence Portfolio
Business Intelligence PortfolioBob Litsinger
 
SICP_2.5 일반화된 연산시스템
SICP_2.5 일반화된 연산시스템SICP_2.5 일반화된 연산시스템
SICP_2.5 일반화된 연산시스템HyeonSeok Choi
 
OSDC.fr 2012 :: Cascalog : progammation logique pour Hadoop
OSDC.fr 2012 :: Cascalog : progammation logique pour HadoopOSDC.fr 2012 :: Cascalog : progammation logique pour Hadoop
OSDC.fr 2012 :: Cascalog : progammation logique pour HadoopPublicis Sapient Engineering
 
MongoDB 在盛大大数据量下的应用
MongoDB 在盛大大数据量下的应用MongoDB 在盛大大数据量下的应用
MongoDB 在盛大大数据量下的应用iammutex
 
POLITEKNIK MALAYSIA
POLITEKNIK MALAYSIAPOLITEKNIK MALAYSIA
POLITEKNIK MALAYSIAAiman Hud
 
CS101- Introduction to Computing- Lecture 26
CS101- Introduction to Computing- Lecture 26CS101- Introduction to Computing- Lecture 26
CS101- Introduction to Computing- Lecture 26Bilal Ahmed
 
JavaFX 2.0 With Alternative Languages - JavaOne 2011
JavaFX 2.0 With Alternative Languages - JavaOne 2011JavaFX 2.0 With Alternative Languages - JavaOne 2011
JavaFX 2.0 With Alternative Languages - JavaOne 2011Stephen Chin
 
groovy databases
groovy databasesgroovy databases
groovy databasesPaul King
 
ClickHouse Unleashed 2020: Our Favorite New Features for Your Analytical Appl...
ClickHouse Unleashed 2020: Our Favorite New Features for Your Analytical Appl...ClickHouse Unleashed 2020: Our Favorite New Features for Your Analytical Appl...
ClickHouse Unleashed 2020: Our Favorite New Features for Your Analytical Appl...Altinity Ltd
 
That’s My App - Running in Your Background - Draining Your Battery
That’s My App - Running in Your Background - Draining Your BatteryThat’s My App - Running in Your Background - Draining Your Battery
That’s My App - Running in Your Background - Draining Your BatteryMichael Galpin
 
SQLBits X SQL Server 2012 Spatial Indexing
SQLBits X SQL Server 2012 Spatial IndexingSQLBits X SQL Server 2012 Spatial Indexing
SQLBits X SQL Server 2012 Spatial IndexingMichael Rys
 
Introduction à dart
Introduction à dartIntroduction à dart
Introduction à dartyohanbeschi
 
Advanced Windows Debugging
Advanced Windows DebuggingAdvanced Windows Debugging
Advanced Windows DebuggingBala Subra
 

What's hot (20)

Business Intelligence Portfolio
Business Intelligence PortfolioBusiness Intelligence Portfolio
Business Intelligence Portfolio
 
SICP_2.5 일반화된 연산시스템
SICP_2.5 일반화된 연산시스템SICP_2.5 일반화된 연산시스템
SICP_2.5 일반화된 연산시스템
 
Sql
SqlSql
Sql
 
OSDC.fr 2012 :: Cascalog : progammation logique pour Hadoop
OSDC.fr 2012 :: Cascalog : progammation logique pour HadoopOSDC.fr 2012 :: Cascalog : progammation logique pour Hadoop
OSDC.fr 2012 :: Cascalog : progammation logique pour Hadoop
 
MongoDB 在盛大大数据量下的应用
MongoDB 在盛大大数据量下的应用MongoDB 在盛大大数据量下的应用
MongoDB 在盛大大数据量下的应用
 
Database security
Database securityDatabase security
Database security
 
Codemash-Clojure.pdf
Codemash-Clojure.pdfCodemash-Clojure.pdf
Codemash-Clojure.pdf
 
COLLADA & WebGL
COLLADA & WebGLCOLLADA & WebGL
COLLADA & WebGL
 
POLITEKNIK MALAYSIA
POLITEKNIK MALAYSIAPOLITEKNIK MALAYSIA
POLITEKNIK MALAYSIA
 
CS101- Introduction to Computing- Lecture 26
CS101- Introduction to Computing- Lecture 26CS101- Introduction to Computing- Lecture 26
CS101- Introduction to Computing- Lecture 26
 
JavaFX 2.0 With Alternative Languages - JavaOne 2011
JavaFX 2.0 With Alternative Languages - JavaOne 2011JavaFX 2.0 With Alternative Languages - JavaOne 2011
JavaFX 2.0 With Alternative Languages - JavaOne 2011
 
groovy databases
groovy databasesgroovy databases
groovy databases
 
ClickHouse Unleashed 2020: Our Favorite New Features for Your Analytical Appl...
ClickHouse Unleashed 2020: Our Favorite New Features for Your Analytical Appl...ClickHouse Unleashed 2020: Our Favorite New Features for Your Analytical Appl...
ClickHouse Unleashed 2020: Our Favorite New Features for Your Analytical Appl...
 
SQL introduction
SQL introductionSQL introduction
SQL introduction
 
That’s My App - Running in Your Background - Draining Your Battery
That’s My App - Running in Your Background - Draining Your BatteryThat’s My App - Running in Your Background - Draining Your Battery
That’s My App - Running in Your Background - Draining Your Battery
 
SQLBits X SQL Server 2012 Spatial Indexing
SQLBits X SQL Server 2012 Spatial IndexingSQLBits X SQL Server 2012 Spatial Indexing
SQLBits X SQL Server 2012 Spatial Indexing
 
Introduction à dart
Introduction à dartIntroduction à dart
Introduction à dart
 
ROracle
ROracle ROracle
ROracle
 
Advanced Windows Debugging
Advanced Windows DebuggingAdvanced Windows Debugging
Advanced Windows Debugging
 
Ch4
Ch4Ch4
Ch4
 

Viewers also liked

Programação 55° JASC - Alterada (7-12-2015)
Programação 55° JASC - Alterada (7-12-2015)Programação 55° JASC - Alterada (7-12-2015)
Programação 55° JASC - Alterada (7-12-2015)esportealtovale
 
Eveline Sillevis Smitt - Omgevingswet
Eveline Sillevis Smitt - OmgevingswetEveline Sillevis Smitt - Omgevingswet
Eveline Sillevis Smitt - OmgevingswetAKD
 
Macro Analytics 03-09-13 - Hyperinflation w/ Ty Andros
Macro Analytics 03-09-13 - Hyperinflation w/ Ty AndrosMacro Analytics 03-09-13 - Hyperinflation w/ Ty Andros
Macro Analytics 03-09-13 - Hyperinflation w/ Ty AndrosGordonTLong.com
 
Market summary pptx 8.7.2014
Market summary pptx  8.7.2014Market summary pptx  8.7.2014
Market summary pptx 8.7.2014Ifb India
 
Serveur Weather Environnement Canada
Serveur Weather Environnement CanadaServeur Weather Environnement Canada
Serveur Weather Environnement Canadamontrealouvert
 
Comercio electrónico
Comercio electrónicoComercio electrónico
Comercio electrónicomiinee
 
Présentation opendata christiangendreau
Présentation opendata christiangendreauPrésentation opendata christiangendreau
Présentation opendata christiangendreaumontrealouvert
 
Hack reduce introduction
Hack reduce introductionHack reduce introduction
Hack reduce introductionmontrealouvert
 
DataMart - Miguel Tremblay - Environnement Canada
DataMart - Miguel Tremblay - Environnement CanadaDataMart - Miguel Tremblay - Environnement Canada
DataMart - Miguel Tremblay - Environnement Canadamontrealouvert
 
Precision Medicine: Opportunities and Challenges for Clinical Trials
Precision Medicine: Opportunities and Challenges for Clinical TrialsPrecision Medicine: Opportunities and Challenges for Clinical Trials
Precision Medicine: Opportunities and Challenges for Clinical TrialsMedpace
 
CityCamp & Hack 2014 - Cайт 112 го муниципалитета
CityCamp & Hack 2014 - Cайт 112 го муниципалитетаCityCamp & Hack 2014 - Cайт 112 го муниципалитета
CityCamp & Hack 2014 - Cайт 112 го муниципалитетаOpen City Foundation
 
наши достижения
наши достижениянаши достижения
наши достиженияZmeev88
 
Associação de Surdos do Porto
Associação de Surdos do PortoAssociação de Surdos do Porto
Associação de Surdos do PortoCarlos Alberto
 

Viewers also liked (17)

Casolego
CasolegoCasolego
Casolego
 
Programação 55° JASC - Alterada (7-12-2015)
Programação 55° JASC - Alterada (7-12-2015)Programação 55° JASC - Alterada (7-12-2015)
Programação 55° JASC - Alterada (7-12-2015)
 
habiba CV
habiba CVhabiba CV
habiba CV
 
Eveline Sillevis Smitt - Omgevingswet
Eveline Sillevis Smitt - OmgevingswetEveline Sillevis Smitt - Omgevingswet
Eveline Sillevis Smitt - Omgevingswet
 
Macro Analytics 03-09-13 - Hyperinflation w/ Ty Andros
Macro Analytics 03-09-13 - Hyperinflation w/ Ty AndrosMacro Analytics 03-09-13 - Hyperinflation w/ Ty Andros
Macro Analytics 03-09-13 - Hyperinflation w/ Ty Andros
 
Joost ouwerkerk
Joost ouwerkerk Joost ouwerkerk
Joost ouwerkerk
 
Market summary pptx 8.7.2014
Market summary pptx  8.7.2014Market summary pptx  8.7.2014
Market summary pptx 8.7.2014
 
Serveur Weather Environnement Canada
Serveur Weather Environnement CanadaServeur Weather Environnement Canada
Serveur Weather Environnement Canada
 
Comercio electrónico
Comercio electrónicoComercio electrónico
Comercio electrónico
 
Présentation opendata christiangendreau
Présentation opendata christiangendreauPrésentation opendata christiangendreau
Présentation opendata christiangendreau
 
Hack reduce introduction
Hack reduce introductionHack reduce introduction
Hack reduce introduction
 
DataMart - Miguel Tremblay - Environnement Canada
DataMart - Miguel Tremblay - Environnement CanadaDataMart - Miguel Tremblay - Environnement Canada
DataMart - Miguel Tremblay - Environnement Canada
 
Precision Medicine: Opportunities and Challenges for Clinical Trials
Precision Medicine: Opportunities and Challenges for Clinical TrialsPrecision Medicine: Opportunities and Challenges for Clinical Trials
Precision Medicine: Opportunities and Challenges for Clinical Trials
 
CityCamp & Hack 2014 - Cайт 112 го муниципалитета
CityCamp & Hack 2014 - Cайт 112 го муниципалитетаCityCamp & Hack 2014 - Cайт 112 го муниципалитета
CityCamp & Hack 2014 - Cайт 112 го муниципалитета
 
Practica 2 l
Practica 2 lPractica 2 l
Practica 2 l
 
наши достижения
наши достижениянаши достижения
наши достижения
 
Associação de Surdos do Porto
Associação de Surdos do PortoAssociação de Surdos do Porto
Associação de Surdos do Porto
 

Similar to Hack reduce mr-intro

Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...CloudxLab
 
Using Spark to Load Oracle Data into Cassandra
Using Spark to Load Oracle Data into CassandraUsing Spark to Load Oracle Data into Cassandra
Using Spark to Load Oracle Data into CassandraJim Hatcher
 
Using Spark to Load Oracle Data into Cassandra (Jim Hatcher, IHS Markit) | C*...
Using Spark to Load Oracle Data into Cassandra (Jim Hatcher, IHS Markit) | C*...Using Spark to Load Oracle Data into Cassandra (Jim Hatcher, IHS Markit) | C*...
Using Spark to Load Oracle Data into Cassandra (Jim Hatcher, IHS Markit) | C*...DataStax
 
Hadoop Integration in Cassandra
Hadoop Integration in CassandraHadoop Integration in Cassandra
Hadoop Integration in CassandraJairam Chandar
 
Full stack analytics with Hadoop 2
Full stack analytics with Hadoop 2Full stack analytics with Hadoop 2
Full stack analytics with Hadoop 2Gabriele Modena
 
Mapredtutorial
MapredtutorialMapredtutorial
MapredtutorialAnup Mohta
 
Cascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUGCascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUGMatthew McCullough
 
Stata cheatsheet programming
Stata cheatsheet programmingStata cheatsheet programming
Stata cheatsheet programmingTim Essam
 
Stata Programming Cheat Sheet
Stata Programming Cheat SheetStata Programming Cheat Sheet
Stata Programming Cheat SheetLaura Hughes
 
User Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryUser Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryDatabricks
 
User Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryUser Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryDatabricks
 
Introduction to Scalding and Monoids
Introduction to Scalding and MonoidsIntroduction to Scalding and Monoids
Introduction to Scalding and MonoidsHugo Gävert
 
Cs267 hadoop programming
Cs267 hadoop programmingCs267 hadoop programming
Cs267 hadoop programmingKuldeep Dhole
 
Easy Scaling with Open Source Data Structures, by Talip Ozturk
Easy Scaling with Open Source Data Structures, by Talip OzturkEasy Scaling with Open Source Data Structures, by Talip Ozturk
Easy Scaling with Open Source Data Structures, by Talip OzturkZeroTurnaround
 
ACADILD:: HADOOP LESSON
ACADILD:: HADOOP LESSON ACADILD:: HADOOP LESSON
ACADILD:: HADOOP LESSON Padma shree. T
 
MiamiJS - The Future of JavaScript
MiamiJS - The Future of JavaScriptMiamiJS - The Future of JavaScript
MiamiJS - The Future of JavaScriptCaridy Patino
 
External Language Stored Procedures for MySQL
External Language Stored Procedures for MySQLExternal Language Stored Procedures for MySQL
External Language Stored Procedures for MySQLAntony T Curtis
 

Similar to Hack reduce mr-intro (20)

Spark workshop
Spark workshopSpark workshop
Spark workshop
 
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...
Writing MapReduce Programs using Java | Big Data Hadoop Spark Tutorial | Clou...
 
Using Spark to Load Oracle Data into Cassandra
Using Spark to Load Oracle Data into CassandraUsing Spark to Load Oracle Data into Cassandra
Using Spark to Load Oracle Data into Cassandra
 
Using Spark to Load Oracle Data into Cassandra (Jim Hatcher, IHS Markit) | C*...
Using Spark to Load Oracle Data into Cassandra (Jim Hatcher, IHS Markit) | C*...Using Spark to Load Oracle Data into Cassandra (Jim Hatcher, IHS Markit) | C*...
Using Spark to Load Oracle Data into Cassandra (Jim Hatcher, IHS Markit) | C*...
 
Hadoop Integration in Cassandra
Hadoop Integration in CassandraHadoop Integration in Cassandra
Hadoop Integration in Cassandra
 
Full stack analytics with Hadoop 2
Full stack analytics with Hadoop 2Full stack analytics with Hadoop 2
Full stack analytics with Hadoop 2
 
Mapredtutorial
MapredtutorialMapredtutorial
Mapredtutorial
 
Cascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUGCascading Through Hadoop for the Boulder JUG
Cascading Through Hadoop for the Boulder JUG
 
Stata cheatsheet programming
Stata cheatsheet programmingStata cheatsheet programming
Stata cheatsheet programming
 
Stata Programming Cheat Sheet
Stata Programming Cheat SheetStata Programming Cheat Sheet
Stata Programming Cheat Sheet
 
User Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryUser Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love Story
 
User Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love StoryUser Defined Aggregation in Apache Spark: A Love Story
User Defined Aggregation in Apache Spark: A Love Story
 
Introduction to Scalding and Monoids
Introduction to Scalding and MonoidsIntroduction to Scalding and Monoids
Introduction to Scalding and Monoids
 
spring-tutorial
spring-tutorialspring-tutorial
spring-tutorial
 
Cs267 hadoop programming
Cs267 hadoop programmingCs267 hadoop programming
Cs267 hadoop programming
 
Easy Scaling with Open Source Data Structures, by Talip Ozturk
Easy Scaling with Open Source Data Structures, by Talip OzturkEasy Scaling with Open Source Data Structures, by Talip Ozturk
Easy Scaling with Open Source Data Structures, by Talip Ozturk
 
ACADILD:: HADOOP LESSON
ACADILD:: HADOOP LESSON ACADILD:: HADOOP LESSON
ACADILD:: HADOOP LESSON
 
Amazon elastic map reduce
Amazon elastic map reduceAmazon elastic map reduce
Amazon elastic map reduce
 
MiamiJS - The Future of JavaScript
MiamiJS - The Future of JavaScriptMiamiJS - The Future of JavaScript
MiamiJS - The Future of JavaScript
 
External Language Stored Procedures for MySQL
External Language Stored Procedures for MySQLExternal Language Stored Procedures for MySQL
External Language Stored Procedures for MySQL
 

More from montrealouvert

5 @ 7 Geek à Sid Lee Technologies
5 @ 7 Geek à Sid Lee Technologies5 @ 7 Geek à Sid Lee Technologies
5 @ 7 Geek à Sid Lee Technologiesmontrealouvert
 
5@7 Données Ouvertes Montréal Juin
5@7 Données Ouvertes Montréal Juin5@7 Données Ouvertes Montréal Juin
5@7 Données Ouvertes Montréal Juinmontrealouvert
 
Contrats net - analyse des contrats
Contrats net - analyse des contratsContrats net - analyse des contrats
Contrats net - analyse des contratsmontrealouvert
 
Journée de la culture ouverte - Luc Gauvreau
Journée de la culture ouverte - Luc GauvreauJournée de la culture ouverte - Luc Gauvreau
Journée de la culture ouverte - Luc Gauvreaumontrealouvert
 
Données Ouvertes et les terrains contaminés
Données Ouvertes et les terrains contaminés Données Ouvertes et les terrains contaminés
Données Ouvertes et les terrains contaminés montrealouvert
 
Conférence corruption Institut du nouveau monde (INM)
Conférence corruption Institut du nouveau monde (INM)Conférence corruption Institut du nouveau monde (INM)
Conférence corruption Institut du nouveau monde (INM)montrealouvert
 
Allumer - Présentation de LDAC à Hackons la Corrutpion
Allumer - Présentation de LDAC à Hackons la CorrutpionAllumer - Présentation de LDAC à Hackons la Corrutpion
Allumer - Présentation de LDAC à Hackons la Corrutpionmontrealouvert
 
Jean Fortier Hackons la Corruption
Jean Fortier Hackons la CorruptionJean Fortier Hackons la Corruption
Jean Fortier Hackons la Corruptionmontrealouvert
 
Présentation par Nord Ouvert - Hackons la corruption
Présentation par Nord Ouvert - Hackons la corruptionPrésentation par Nord Ouvert - Hackons la corruption
Présentation par Nord Ouvert - Hackons la corruptionmontrealouvert
 
Ffctn hackons la-corruption
Ffctn hackons la-corruptionFfctn hackons la-corruption
Ffctn hackons la-corruptionmontrealouvert
 
Communautaire médias sociaux et démocratie directe
Communautaire médias sociaux et démocratie directeCommunautaire médias sociaux et démocratie directe
Communautaire médias sociaux et démocratie directemontrealouvert
 
Congrès des archivestes
Congrès des archivestesCongrès des archivestes
Congrès des archivestesmontrealouvert
 
Première rencontre publique Québec Ouvert
Première rencontre publique Québec OuvertPremière rencontre publique Québec Ouvert
Première rencontre publique Québec Ouvertmontrealouvert
 
How to build an open data movement in your city, state, or province OKFN data...
How to build an open data movement in your city, state, or province OKFN data...How to build an open data movement in your city, state, or province OKFN data...
How to build an open data movement in your city, state, or province OKFN data...montrealouvert
 
Présentation avec l'équipe Gautrin à l'Assemblée Nationale à Québec
Présentation avec l'équipe Gautrin à l'Assemblée Nationale à QuébecPrésentation avec l'équipe Gautrin à l'Assemblée Nationale à Québec
Présentation avec l'équipe Gautrin à l'Assemblée Nationale à Québecmontrealouvert
 
WebÉduction Données ouvertes enjeux
WebÉduction Données ouvertes enjeuxWebÉduction Données ouvertes enjeux
WebÉduction Données ouvertes enjeuxmontrealouvert
 
WebÉducation Gouvernement 2.0
WebÉducation Gouvernement 2.0WebÉducation Gouvernement 2.0
WebÉducation Gouvernement 2.0montrealouvert
 
Présentation à WebEducation avril 2011
Présentation à WebEducation avril 2011Présentation à WebEducation avril 2011
Présentation à WebEducation avril 2011montrealouvert
 
Intracom 2011 - ActionTI - Donées Ouvertes
Intracom 2011 - ActionTI - Donées OuvertesIntracom 2011 - ActionTI - Donées Ouvertes
Intracom 2011 - ActionTI - Donées Ouvertesmontrealouvert
 
Conférence LegalIT 5.0_présentation_MontréalOuvert
Conférence LegalIT 5.0_présentation_MontréalOuvertConférence LegalIT 5.0_présentation_MontréalOuvert
Conférence LegalIT 5.0_présentation_MontréalOuvertmontrealouvert
 

More from montrealouvert (20)

5 @ 7 Geek à Sid Lee Technologies
5 @ 7 Geek à Sid Lee Technologies5 @ 7 Geek à Sid Lee Technologies
5 @ 7 Geek à Sid Lee Technologies
 
5@7 Données Ouvertes Montréal Juin
5@7 Données Ouvertes Montréal Juin5@7 Données Ouvertes Montréal Juin
5@7 Données Ouvertes Montréal Juin
 
Contrats net - analyse des contrats
Contrats net - analyse des contratsContrats net - analyse des contrats
Contrats net - analyse des contrats
 
Journée de la culture ouverte - Luc Gauvreau
Journée de la culture ouverte - Luc GauvreauJournée de la culture ouverte - Luc Gauvreau
Journée de la culture ouverte - Luc Gauvreau
 
Données Ouvertes et les terrains contaminés
Données Ouvertes et les terrains contaminés Données Ouvertes et les terrains contaminés
Données Ouvertes et les terrains contaminés
 
Conférence corruption Institut du nouveau monde (INM)
Conférence corruption Institut du nouveau monde (INM)Conférence corruption Institut du nouveau monde (INM)
Conférence corruption Institut du nouveau monde (INM)
 
Allumer - Présentation de LDAC à Hackons la Corrutpion
Allumer - Présentation de LDAC à Hackons la CorrutpionAllumer - Présentation de LDAC à Hackons la Corrutpion
Allumer - Présentation de LDAC à Hackons la Corrutpion
 
Jean Fortier Hackons la Corruption
Jean Fortier Hackons la CorruptionJean Fortier Hackons la Corruption
Jean Fortier Hackons la Corruption
 
Présentation par Nord Ouvert - Hackons la corruption
Présentation par Nord Ouvert - Hackons la corruptionPrésentation par Nord Ouvert - Hackons la corruption
Présentation par Nord Ouvert - Hackons la corruption
 
Ffctn hackons la-corruption
Ffctn hackons la-corruptionFfctn hackons la-corruption
Ffctn hackons la-corruption
 
Communautaire médias sociaux et démocratie directe
Communautaire médias sociaux et démocratie directeCommunautaire médias sociaux et démocratie directe
Communautaire médias sociaux et démocratie directe
 
Congrès des archivestes
Congrès des archivestesCongrès des archivestes
Congrès des archivestes
 
Première rencontre publique Québec Ouvert
Première rencontre publique Québec OuvertPremière rencontre publique Québec Ouvert
Première rencontre publique Québec Ouvert
 
How to build an open data movement in your city, state, or province OKFN data...
How to build an open data movement in your city, state, or province OKFN data...How to build an open data movement in your city, state, or province OKFN data...
How to build an open data movement in your city, state, or province OKFN data...
 
Présentation avec l'équipe Gautrin à l'Assemblée Nationale à Québec
Présentation avec l'équipe Gautrin à l'Assemblée Nationale à QuébecPrésentation avec l'équipe Gautrin à l'Assemblée Nationale à Québec
Présentation avec l'équipe Gautrin à l'Assemblée Nationale à Québec
 
WebÉducation Données ouvertes enjeux
WebÉducation Données ouvertes enjeuxWebÉducation Données ouvertes enjeux
WebÉducation Données ouvertes enjeux
 
WebÉducation Gouvernement 2.0
WebÉducation Gouvernement 2.0WebÉducation Gouvernement 2.0
WebÉducation Gouvernement 2.0
 
Présentation à WebEducation avril 2011
Présentation à WebEducation avril 2011Présentation à WebEducation avril 2011
Présentation à WebEducation avril 2011
 
Intracom 2011 - ActionTI - Données Ouvertes
Intracom 2011 - ActionTI - Données OuvertesIntracom 2011 - ActionTI - Données Ouvertes
Intracom 2011 - ActionTI - Données Ouvertes
 
Conférence LegalIT 5.0_présentation_MontréalOuvert
Conférence LegalIT 5.0_présentation_MontréalOuvertConférence LegalIT 5.0_présentation_MontréalOuvert
Conférence LegalIT 5.0_présentation_MontréalOuvert
 

Hack reduce mr-intro

  • 1. HackReduce MapReduce Intro Hopper.com (Greg Lu)
  • 2. Project github.com/hackreduce/Hackathon Wiki github.com/hackreduce/Hackathon/wiki Download the Github project for some sample datasets
  • 3. datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv } NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35 NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60 NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23 InputSplit 1 NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57 } NASDAQ,DYNT,2008-12-29,0.31,0.31,0.29,0.30,26900,0.30 NASDAQ,DMLP,2003-10-21,17.65,17.94,17.58,17.59,4800,9.73 NASDAQ,DORM,1997-02-07,7.88,7.88,7.63,7.75,7400,3.87 InputSplit 2 NASDAQ,DXPE,2004-10-25,5.19,5.24,5.00,5.00,7600,2.50 } NASDAQ,DEST,2009-03-17,4.55,5.03,4.55,5.03,6800,5.03 NASDAQ,DBRN,1992-01-02,8.88,9.25,8.75,8.88,84800,2.22 NASDAQ,DXYN,1998-11-25,6.38,6.44,6.19,6.25,211100,6.25 InputSplit 3 NASDAQ,DEAR,1998-12-08,10.50,11.50,10.50,10.50,5800,6.45 ... org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version) public int run(String[] args) throws Exception { Configuration conf = getConf(); if (args.length != 2) { System.err.println("Usage: " + getClass().getName() + " <input> <output>"); System.exit(2); } // Creating the MapReduce job (configuration) object Job job = new Job(conf); job.setJarByClass(getClass()); job.setJobName(getClass().getName()); } Defines how the data is split // The Nasdaq/NYSE data dumps come in as a CSV file (text input), so we configure // the job to use this format. job.setInputFormatClass(TextInputFormat.class); and assigned to which mappers [...]
  • 4. datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv datasets/nasdaq/daily_prices } NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35 NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60 NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23 InputSplit 1 NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57 org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version) public int run(String[] args) throws Exception { [...] // Tell the job which Mapper and Reducer to use (classes defined above) job.setMapperClass(MarketCapitalizationMapper.class); job.setReducerClass(MarketCapitalizationReducer.class); } Point the job to the custom classes that we created in order to process the data. } // This is what the Mapper will be outputting to the Reducer job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(DoubleWritable.class); Define the types of the (key, value) // This is what the Reducer will be outputting pairs that we’ll be outputting from the job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); mappers and the result of the job itself. // Setting the input folder of the job FileInputFormat.addInputPath(job, new Path(args[0])); // Preparing the output folder by first deleting it if it exists Path output = new Path(args[1]); FileSystem.get(conf).delete(output, true); FileOutputFormat.setOutputPath(job, output); Now we’ll show the MarketCapitalizationMapper class
  • 5. datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv datasets/nasdaq/daily_prices } NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35 NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60 NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23 InputSplit 1 NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57 org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version) public static class MarketCapitalizationMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> { protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String inputString = value.toString(); String[] attributes = inputString.split(","); if (attributes.length != 9) throw new IllegalArgumentException("Input string given did not have 9 values in CSV format"); try { String exchange = attributes[0]; String stockSymbol = attributes[1]; Date date = sdf.parse(attributes[2]); double stockPriceOpen = Double.parseDouble(attributes[3]); double stockPriceHigh = Double.parseDouble(attributes[4]); double stockPriceLow = Double.parseDouble(attributes[5]); double stockPriceClose = Double.parseDouble(attributes[6]); int stockVolume = Integer.parseInt(attributes[7]); double stockPriceAdjClose = Double.parseDouble(attributes[8]); } catch (ParseException e) { throw new IllegalArgumentException("Input string contained an unknown value that couldn't be parsed"); } catch (NumberFormatException e) { throw new IllegalArgumentException("Input string contained an unknown number value that couldn't be parsed"); } double marketCap = stockPriceClose * stockVolume; context.write(new Text(stockSymbol), new DoubleWritable(marketCap)); } This job doesn’t do a whole lot, } but this is where the processing } is occurring.
  • 6. datasets/nasdaq/daily_prices/NASDAQ_daily_prices_subset.csv datasets/nasdaq/daily_prices } NASDAQ,DELL,1997-08-26,83.87,84.75,82.50,82.81,48736000,10.35 NASDAQ,DITC,2002-10-24,1.56,1.69,1.53,1.60,133600,1.60 NASDAQ,DLIA,2008-01-28,1.91,2.31,1.91,2.23,760800,2.23 InputSplit 1 NASDAQ,DWCH,2002-07-10,3.09,3.14,3.09,3.14,2400,1.57 (line-by-line) MarketCapitalizationMapper (emits) (DELL, 82.81*48736000) (DITC, 1.60*133600) (DLIA, 2.23*760800) (DWCH, 3.14*2400) (sorted and partitioned to specific reducers) MarketCapitalizationReducer
  • 7. (coming from different mappers) (DELL, 82.81*48736000) (DELL, 31.92*18678500) (DELL, 23.85*16038700) (DELL, 30.38*68759800) (...) (but arriving at the same reducer) org.hackreduce.examples.stockexchange.MarketCapitalization (expanded version) public static class MarketCapitalizationReducer extends Reducer<Text, DoubleWritable, Text, Text> { NumberFormat currencyFormat = NumberFormat.getCurrencyInstance(Locale.getDefault()); @Override protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException { double highestCap = 0.0; for (DoubleWritable value : values) { highestCap = Math.max(highestCap, value.get()); } context.write(key, new Text(currencyFormat.format(highestCap))); } } (output of this reducer) (DELL, $4,035,828,160.00)
  • 8. /tmp/nasdaq_marketcaps/part-r-00000 DAIO $1,515,345.00 DAKT $63,656,600.00 DANKY $89,668,857.00 DARA $1,464,720.00 DASTY $14,141,055.00 DATA $2,888,325.00 DAVE $5,144,800.00 DBLE $1,040,996.00 DBLEP $79,584.00 DBRN $131,023,326.00 DBTK $7,405,366.00 DCAI $20,058,990.00 DCGN $10,372,992.00 DCOM $12,298,208.00 DCTH $3,285,652.00 DDDC $79,176.00 DDIC $3,684,100.00 DDMX $7,811,204.00 DDRX $12,480,500.00 DDSS $4,545,438.00 DEAR $4,375,800.00 DECK $271,081,580.00 DEER $5,363,740.00 DEIX $5,285,892.00
  • 9. We can dynamically increase your clusters if you need the processing power, but it’s typically bottlenecked by the code. If your job takes longer than 10 minutes to run, come see us.