Hadoop_21_Implementing Join in MapReduce
Published: 2019-06-24


1. Hadoop Serialization

1.1. Hadoop's serialization format

  Serialization and deserialization are the conversion between structured objects and byte streams, used mainly for interprocess communication and persistent storage.

  Hadoop uses RPC for internal communication between nodes. The RPC protocol serializes a message into a binary byte stream and sends it to the remote node, which deserializes the byte stream back into the original message.
  Hadoop's own serialization format is provided by classes that implement the Writable interface. It is compact and fast, but it is not easy to extend and is not cross-language.
  Let's first look at the Writable interface. It defines two methods, shown below:
  1. write: writes the object's fields to a binary stream
  2. readFields: reads the object's fields back from a binary stream
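
For reference, the interface itself (from org.apache.hadoop.io) is essentially:

package org.apache.hadoop.io;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public interface Writable {
    // Serialize this object's fields to the binary stream.
    void write(DataOutput out) throws IOException;

    // Deserialize this object's fields from the binary stream,
    // in exactly the same order they were written.
    void readFields(DataInput in) throws IOException;
}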
  

2. Reduce-side join implementation

1. Requirement:

 

 Suppose the data volume is huge and the two tables are stored as files on HDFS. We need a MapReduce program to implement the following SQL query:

   select  a.id,a.date,b.name,b.category_id,b.price from t_order a join t_product b on a.pid = b.id

2. Mechanism:

  Use the join column pid as the map output key and tag each emitted record with the file it came from. Records from both tables that satisfy the join condition are then shuffled to the same reduce task, where the two sides are stitched together. A sketch of the intermediate data is shown below.
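
For instance, with the sample data listed in the code comment below, the reduce call for key P0001 would conceptually receive:

P0001 -> [ order(1001, 20150710, amount=2, flag="0"),
           order(1002, 20150710, amount=3, flag="0"),
           product(小米5, category_id=1001, price=2, flag="1") ]

The single product record is copied aside, and each buffered order record is completed with the product's fields before being written out.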

3. Code:

package cn.bigdata.mr.rjoin;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class InfoBean implements Writable {

    private int order_id;
    private String dateString;
    private String p_id;
    private int amount;
    private String pname;
    private int category_id;
    private float price;

    // flag = "0": this object wraps an order record
    // flag = "1": this object wraps a product record
    private String flag;

    public InfoBean() {
    }

    public void set(int order_id, String dateString, String p_id, int amount,
                    String pname, int category_id, float price, String flag) {
        this.order_id = order_id;
        this.dateString = dateString;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.category_id = category_id;
        this.price = price;
        this.flag = flag;
    }

    public int getOrder_id() { return order_id; }
    public void setOrder_id(int order_id) { this.order_id = order_id; }

    public String getDateString() { return dateString; }
    public void setDateString(String dateString) { this.dateString = dateString; }

    public String getP_id() { return p_id; }
    public void setP_id(String p_id) { this.p_id = p_id; }

    public int getAmount() { return amount; }
    public void setAmount(int amount) { this.amount = amount; }

    public String getPname() { return pname; }
    public void setPname(String pname) { this.pname = pname; }

    public int getCategory_id() { return category_id; }
    public void setCategory_id(int category_id) { this.category_id = category_id; }

    public float getPrice() { return price; }
    public void setPrice(float price) { this.price = price; }

    public String getFlag() { return flag; }
    public void setFlag(String flag) { this.flag = flag; }

    // Fields are written and read back in the same fixed order.
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(order_id);
        out.writeUTF(dateString);
        out.writeUTF(p_id);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeInt(category_id);
        out.writeFloat(price);
        out.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.order_id = in.readInt();
        this.dateString = in.readUTF();
        this.p_id = in.readUTF();
        this.amount = in.readInt();
        this.pname = in.readUTF();
        this.category_id = in.readInt();
        this.price = in.readFloat();
        this.flag = in.readUTF();
    }

    @Override
    public String toString() {
        return "order_id=" + order_id + ", dateString=" + dateString + ", p_id=" + p_id
                + ", amount=" + amount + ", pname=" + pname
                + ", category_id=" + category_id + ", price=" + price;
    }
}
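Note that readFields must read the fields with the same types and in exactly the same order that write wrote them; any mismatch silently corrupts every record that follows in the stream.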
package cn.bigdata.mr.rjoin;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Joins the order table with the product table. Fields are comma-separated.
 *
 * order.txt (order id, date, product id, amount):
 *     1001,20150710,P0001,2
 *     1002,20150710,P0001,3
 *     1002,20150710,P0002,3
 *     1003,20150710,P0003,3
 *
 * product.txt (product id, product name, category id, price):
 *     P0001,小米5,1001,2
 *     P0002,锤子T1,1000,3
 *     P0003,锤子,1002,4
 */
public class RJoin {

    static class RJoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
        InfoBean bean = new InfoBean();
        Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();

            // Tell the two kinds of records apart by the file name of the input split
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String name = inputSplit.getPath().getName();

            String pid = "";
            if (name.startsWith("order")) {
                // order record: id, date, pid, amount
                String[] fields = line.split(",");
                pid = fields[2];
                bean.set(Integer.parseInt(fields[0]), fields[1], pid,
                        Integer.parseInt(fields[3]), "", 0, 0, "0");
            } else {
                // product record: id, pname, category_id, price
                String[] fields = line.split(",");
                pid = fields[0];
                bean.set(0, "", pid, 0, fields[1],
                        Integer.parseInt(fields[2]), Float.parseFloat(fields[3]), "1");
            }
            k.set(pid);
            context.write(k, bean);
        }
    }

    static class RJoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {
        @Override
        protected void reduce(Text pid, Iterable<InfoBean> beans, Context context)
                throws IOException, InterruptedException {
            InfoBean pdBean = new InfoBean();
            ArrayList<InfoBean> orderBeans = new ArrayList<InfoBean>();

            for (InfoBean bean : beans) {
                if ("1".equals(bean.getFlag())) {
                    // product record: keep a copy (the framework reuses the bean instance)
                    try {
                        BeanUtils.copyProperties(pdBean, bean);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                } else {
                    // order record: collect a copy
                    InfoBean odbean = new InfoBean();
                    try {
                        BeanUtils.copyProperties(odbean, bean);
                        orderBeans.add(odbean);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                }
            }

            // Stitch the two kinds of records together to form the final result
            for (InfoBean bean : orderBeans) {
                bean.setPname(pdBean.getPname());
                bean.setCategory_id(pdBean.getCategory_id());
                bean.setPrice(pdBean.getPrice());
                context.write(bean, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // legacy key; on Hadoop 2.x the equivalent is mapreduce.output.textoutputformat.separator
        conf.set("mapred.textoutputformat.separator", ",");

        Job job = Job.getInstance(conf);

        // Locate the jar containing this program
        job.setJarByClass(RJoin.class);

        // Mapper/Reducer classes for this job
        job.setMapperClass(RJoinMapper.class);
        job.setReducerClass(RJoinReducer.class);

        // Map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfoBean.class);

        // Final output key/value types
        job.setOutputKeyClass(InfoBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Input directory of the job
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Output directory of the job
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job, with its configuration and jar, to YARN and wait for completion
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
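
Assuming the job has been packaged into a jar (the jar name and HDFS paths below are placeholders for illustration), it can be run in the usual way:

hadoop jar rjoin.jar cn.bigdata.mr.rjoin.RJoin /rjoin/input /rjoin/output

where /rjoin/input contains order.txt and product.txt and /rjoin/output must not already exist. Note that this reduce-side join buffers all order records for a given pid in memory, so a heavily skewed key can put pressure on a single reduce task; when one table is small, a map-side join is the usual remedy.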

Output:

order_id=1002, dateString=20150710, p_id=P0001, amount=3, pname=sss, category_id=1001, price=2.0
order_id=1001, dateString=20150710, p_id=P0001, amount=2, pname=sss, category_id=1001, price=2.0
order_id=1002, dateString=20150710, p_id=P0002, amount=3, pname=111, category_id=1000, price=3.0
order_id=1003, dateString=20150710, p_id=P0003, amount=3, pname=www, category_id=1002, price=4.0

Reposted from: https://www.cnblogs.com/yaboya/p/9241740.html
