Bridging Between CAD & GIS: 6 Ways to Automate Your Data Integration
AJUG April 2011 Cascading example
package org.ajug;
import cascading.cascade.Cascade;
import cascading.cascade.CascadeConnector;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.TextDelimited;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import java.util.Properties;
public class Main {
public static void main(String[] args) {
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
properties.put("mapred.reduce.tasks", 5);
Pipe mainPipe = new Each("M&M", new Fields("line"), new Parser());
mainPipe = new GroupBy(mainPipe, new Fields("COLOR"));
mainPipe = new Every(mainPipe, Fields.ALL, new ColorAggregator(new
ColorData()));
Tap sourceTap = new Hfs(new TextLine(), args[0]);
TextDelimited scheme = new TextDelimited(new Fields("COLOR",
"AVG_WIDTH", "AVG_WEIGHT"), ",", """);
scheme.setNumSinkParts(1); // make sure we only get one file
Tap sinkTap = new Hfs(scheme, args[1], SinkMode.REPLACE);
FlowConnector flowConnector = new FlowConnector(properties);
CascadeConnector cascadeConnector = new CascadeConnector();
Flow flow = flowConnector.connect(sourceTap, sinkTap, mainPipe);
Cascade cascade = cascadeConnector.connect(flow);
cascade.complete(); // Finally run everything
}
}
================================================
package org.ajug;
import cascading.cascade.Cascade;
import cascading.cascade.CascadeConnector;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.pipe.*;
import cascading.scheme.TextDelimited;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
public class MultiOutputMain {
public static void main(String[] args) {
Properties properties = new Properties();
FlowConnector.setApplicationJarClass(properties, Main.class);
properties.put("mapred.reduce.tasks", 5);
Pipe sourcePipe = new Each("M&M", new Fields("line"), new Parser());
Pipe totalPipe = new GroupBy("Total", sourcePipe, new Fields("ONE"));
totalPipe = new Every(totalPipe, Fields.ALL, new TotalAggregator(new
TotalData()));
Pipe mainPipe = new GroupBy("Color", sourcePipe, new Fields("COLOR"));
mainPipe = new Every(mainPipe, Fields.ALL, new ColorAggregator(new
ColorData()));
Tap sourceTap = new Hfs(new TextLine(), args[0]);
TextDelimited scheme = new TextDelimited(new Fields("COLOR",
"AVG_WIDTH", "AVG_WEIGHT"), ",", """);
scheme.setNumSinkParts(1); // make sure we only get one file
Tap colorTap = new Hfs(scheme, args[1] + "/color", SinkMode.REPLACE);
TextDelimited totalScheme = new TextDelimited(new
Fields("FINAL_WIDTH", "FINAL_WEIGHT"), ",", """);
totalScheme.setNumSinkParts(1); // make sure we only get one file
Tap totalTap = new Hfs(totalScheme, args[1] + "/total",
SinkMode.REPLACE);
FlowConnector flowConnector = new FlowConnector(properties);
CascadeConnector cascadeConnector = new CascadeConnector();
Map<String, Tap> outputs = new HashMap<String, Tap>();
outputs.put(totalPipe.getName(), totalTap);
outputs.put(mainPipe.getName(), colorTap);
Flow flow = flowConnector.connect(sourceTap, outputs, totalPipe,
mainPipe);
Cascade cascade = cascadeConnector.connect(flow);
cascade.complete(); // Finally run everything
}
}
=======================================
package org.ajug;
import cascading.flow.FlowProcess;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import java.io.Serializable;
public class Parser extends cascading.operation.BaseOperation implements
Serializable, Function {
public Parser() {
super(new Fields("ONE","COLOR", "WIDTH", "WEIGHT")); // should be
constants file ;)
}
public void operate(FlowProcess a_flow, FunctionCall a_call) {
String sourceData = a_call.getArguments().getString(0);
sourceData = sourceData.trim();
if (sourceData == null || sourceData.length() == 0) {
return; // blank line read from the source file, so ignore
it
}
String values[] = sourceData.split(",");
Tuple output = new Tuple();
output.add("1");
output.add(values[0]);
output.add(values[1]);
output.add(values[2]);
a_call.getOutputCollector().add(output);
}
}
==============================================
package org.ajug;
import cascading.tuple.Tuple;
import java.io.Serializable;
public class ColorData implements Serializable {
private long m_num = 0;
private double m_width = 0;
private double m_weight = 0;
public void reset(){
m_num = 0;
m_width = 0;
m_weight = 0;
}
4. public void addData(double a_weight, double a_width){
m_weight += a_weight;
m_width+=a_width;
m_num++;
}
public Tuple getTuple() {
if (m_num == 0) return null;
Tuple rtnValue = new Tuple();
rtnValue.add(m_width/m_num);
rtnValue.add(m_weight/m_num);
return rtnValue;
}
}
===============================================
package org.ajug;
import cascading.tuple.Tuple;
import java.io.Serializable;
/**
 * Serializable running totals (sample count, summed width, summed weight) that can be
 * turned into a tuple of averages.
 */
public class TotalData implements Serializable {
    private long m_num = 0;
    private double m_width = 0;
    private double m_weight = 0;

    /** Returns the accumulator to its initial, empty state. */
    public void reset() {
        this.m_weight = 0;
        this.m_width = 0;
        this.m_num = 0;
    }

    /**
     * Records a single sample. The weight is passed first and the width second.
     *
     * @param a_weight weight to add to the running weight sum
     * @param a_width  width to add to the running width sum
     */
    public void addData(double a_weight, double a_width) {
        this.m_num++;
        this.m_width += a_width;
        this.m_weight += a_weight;
    }

    /**
     * Produces the averages as a tuple: average width first, average weight second.
     *
     * @return the averages tuple, or {@code null} if no sample has been recorded
     */
    public Tuple getTuple() {
        if (this.m_num != 0) {
            final double averageWidth = this.m_width / this.m_num;
            final double averageWeight = this.m_weight / this.m_num;

            Tuple averages = new Tuple();
            averages.add(averageWidth);
            averages.add(averageWeight);
            return averages;
        }
        return null;
    }
}
==================================================
package org.ajug;