本文主要是介绍二、数据仓库电商项目——ODS层,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
ODS层
ODS层是整个阶段最简单一层分层,主要职责是建立数据原始层,将数据都保存在HDFS上,经过步骤1我们建立出来了两个数据文件夹,一个是db,一个是log,分别代表业务数据、用户行为数据。
ODS层的特点:
(1)保持数据原貌不做任何修改,起到备份数据的作用。
(2)数据采用压缩,减少磁盘存储空间(例如:原始数据100G,可以压缩到10G左右)
(3)创建分区表,防止后续的全表扫描
ODS层用户行为日志数据,日志文件—>HDFS—>ODS落盘
在用户行为数据中,
需要创建两张表,一张是启动数据表,一张是事件行为表。在最后进行脚本的编写。
分解需求:当前需要建立两张表ods_start_log,ods_event_log,采用LZO压缩。落盘路径为:/warehouse/gamll/ods/ods_start_log
创建启动日志表ods_start_log
CREATE DATABASE gmall;
use gmall;DROP TABLE IF EXISTS ods_start_log;
CREATE EXTERNAL TABLE ods_start_log (line string)
partitioned by (dt string)
STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
LOCATION '/warehouse/gamll/ods/ods_start_log';LOAD DATA INPATH '/origin_data_cdh/gmall/log/topic_start/2020-11-19' INTO TABLE ods_start_log PARTITION (dt='2020-11-19');
LOAD DATA INPATH '/origin_data_cdh/gmall/log/topic_start/2020-11-20' INTO TABLE ods_start_log PARTITION (dt='2020-11-20');SELECT * FROM ods_start_log WHERE dt='2020-11-19' LIMIT 2;
创建LZO索引:
sudo -u hdfs hadoop jar /opt/module/LZO/hadoop-lzo-0.4.15-cdh5.8.3.jar com.hadoop.compression.lzo.LzoIndexer /warehouse/gamll/ods/ods_start_log/dt=2020-11-19
创建事件日志表ods_event_log 同上
创建脚本 hdfs_to_ods_log.sh
#!/bin/bash# 定义变量方便修改
APP=gmall
# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;thendo_date=$1
elsedo_date=`date -d "-1 day" +%F`
fi
echo "===说明:ods层用户行为日志导入==="
echo "===日志日期为 $do_date==="
sql="
load data inpath '/origin_data_cdh/gmall/log/topic_start/$do_date' overwrite into table ${APP}.ods_start_log partition(dt='$do_date');load data inpath '/origin_data_cdh/gmall/log/topic_event/$do_date' overwrite into table ${APP}.ods_event_log partition(dt='$do_date');
"sudo -u hive hive -e "$sql"#创建索引
echo "===说明:$do_date ods层 启动日志 创建LZO索引......==="
sudo -u hdfs hadoop jar /opt/module/LZO/hadoop-lzo-0.4.15-cdh5.8.3.jar com.hadoop.compression.lzo.LzoIndexer /warehouse/gamll/ods/ods_start_log/dt=$do_date
echo "===说明:$do_date ods层 事件日志 创建LZO索引......==="
sudo -u hdfs hadoop jar /opt/module/LZO/hadoop-lzo-0.4.15-cdh5.8.3.jar com.hadoop.compression.lzo.LzoIndexer /warehouse/gamll/ods/ods_event_log/dt=$do_date
ODS层业务数据
将MySQL数据库中的业务数据通过sqoop全部罗盘到HDFS上,格式整齐,并且建立了LZO索引,现在我们想将这些数据通过Hive清洗成ODS层的数据
电商表结构(24张表):
订单表(增量及更新)
MySQL中的order_info
建表过程中筛选的字段,你会发现并不是所有的字段,数据清洗只挑选有用
的字段。
drop TABLE IF EXISTS ods_order_info;
CREATE EXTERNAL TABLE ods_order_info(
id STRING COMMENT '订单号',
fina_total_amount DECIMAL(10,2) COMMENT '订单金额',
order_status STRING COMMENT '订单状态',
user_id STRING COMMENT '用户id',
out_trade_no STRING COMMENT '支付流水号',
create_time STRING COMMENT '创建时间',
operate_time STRING COMMENT '操作时间',
province_id STRING COMMENT '省份id',
benefit_reduce_amount DECIMAL(10,2) COMMENT '优惠金额',
original_total_amount DECIMAL(10,2) COMMENT '原始金额',
feight_fee DECIMAL(10,2) COMMENT '运费金额'
)comment '订单表'
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY'\t'
STORED AS INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/warehouse/gamll/ods/ods_oder_info';
订单详情表表(增量)
drop table if exists ods_order_detail;
create external table ods_order_detail( `id` string COMMENT '订单编号',`order_id` string COMMENT '订单号', `user_id` string COMMENT '用户id',`sku_id` string COMMENT '商品id',`sku_name` string COMMENT '商品名称',`order_price` decimal(10,2) COMMENT '商品价格',`sku_num` bigint COMMENT '商品数量',`create_time` string COMMENT '创建时间'
) COMMENT '订单详情表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_order_detail/';
SKU商品表(全量表)
drop table if exists ods_sku_info;
create external table ods_sku_info( `id` string COMMENT 'skuId',`spu_id` string COMMENT 'spuid', `price` decimal(10,2) COMMENT '价格',`sku_name` string COMMENT '商品名称',`sku_desc` string COMMENT '商品描述',`weight` string COMMENT '重量',`tm_id` string COMMENT '品牌id',`category3_id` string COMMENT '品类id',`create_time` string COMMENT '创建时间'
) COMMENT 'SKU商品表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_sku_info/';
用户表(增量及更新)
drop table if exists ods_user_info;
create external table ods_user_info( `id` string COMMENT '用户id',`name` string COMMENT '姓名',`birthday` string COMMENT '生日',`gender` string COMMENT '性别',`email` string COMMENT '邮箱',`user_level` string COMMENT '用户等级',`create_time` string COMMENT '创建时间',`operate_time` string COMMENT '操作时间'
) COMMENT '用户表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_user_info/';
商品一级分类表(全量)
drop table if exists ods_base_category1;
create external table ods_base_category1( `id` string COMMENT 'id',`name` string COMMENT '名称'
) COMMENT '商品一级分类表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_category1/';
支付流水表(增量)
drop table if exists ods_payment_info;
create external table ods_payment_info(`id` bigint COMMENT '编号',`out_trade_no` string COMMENT '对外业务编号',`order_id` string COMMENT '订单编号',`user_id` string COMMENT '用户编号',`alipay_trade_no` string COMMENT '支付宝交易流水编号',`total_amount` decimal(16,2) COMMENT '支付金额',`subject` string COMMENT '交易内容',`payment_type` string COMMENT '支付类型',`payment_time` string COMMENT '支付时间'
) COMMENT '支付流水表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_payment_info/';
省份表(特殊)
drop table if exists ods_base_province;
create external table ods_base_province (`id` bigint COMMENT '编号',`name` string COMMENT '省份名称',`region_id` string COMMENT '地区ID',`area_code` string COMMENT '地区编码',`iso_code` string COMMENT 'iso编码') COMMENT '省份表'
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_province/';
地区表(特殊)
drop table if exists ods_base_region;
create external table ods_base_region (`id` bigint COMMENT '编号',`region_name` string COMMENT '地区名称') COMMENT '地区表'
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_region/';
drop table if exists ods_base_province;
create external table ods_base_province (`id` bigint COMMENT '编号',`name` string COMMENT '省份名称',`region_id` string COMMENT '地区ID',`area_code` string COMMENT '地区编码',`iso_code` string COMMENT 'iso编码') COMMENT '省份表'
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_base_province/';
订单状态表(增量)
drop table if exists ods_order_status_log;
create external table ods_order_status_log (`id` bigint COMMENT '编号',`order_id` string COMMENT '订单ID',`order_status` string COMMENT '订单状态',`operate_time` string COMMENT '修改时间'
) COMMENT '订单状态表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_order_status_log/';
SPU商品表(全量)
drop table if exists ods_spu_info;
create external table ods_spu_info(`id` string COMMENT 'spuid',`spu_name` string COMMENT 'spu名称',`category3_id` string COMMENT '品类id',`tm_id` string COMMENT '品牌id'
) COMMENT 'SPU商品表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_spu_info/';
退单表(增量)
drop table if exists ods_order_refund_info;
create external table ods_order_refund_info(`id` string COMMENT '编号',`user_id` string COMMENT '用户ID',`order_id` string COMMENT '订单ID',`sku_id` string COMMENT '商品ID',`refund_type` string COMMENT '退款类型',`refund_num` bigint COMMENT '退款件数',`refund_amount` decimal(16,2) COMMENT '退款金额',`refund_reason_type` string COMMENT '退款原因类型',`create_time` string COMMENT '退款时间'
) COMMENT '退单表'
PARTITIONED BY (`dt` string)
row format delimited fields terminated by '\t'
STORED ASINPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
location '/warehouse/gmall/ods/ods_order_refund_info/';
ODS层加载数据脚本
1)创建脚本hdfs_to_ods_db.sh
#!/bin/bashAPP=gmall
hive=/opt/module/hive/bin/hive# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$2" ] ;thendo_date=$2
else do_date=`date -d "-1 day" +%F`
fisql1="
load data inpath '/origin_data/$APP/db/order_info/$do_date' OVERWRITE into table ${APP}.ods_order_info partition(dt='$do_date');load data inpath '/origin_data/$APP/db/order_detail/$do_date' OVERWRITE into table ${APP}.ods_order_detail partition(dt='$do_date');load data inpath '/origin_data/$APP/db/sku_info/$do_date' OVERWRITE into table ${APP}.ods_sku_info partition(dt='$do_date');load data inpath '/origin_data/$APP/db/user_info/$do_date' OVERWRITE into table ${APP}.ods_user_info partition(dt='$do_date');load data inpath '/origin_data/$APP/db/payment_info/$do_date' OVERWRITE into table ${APP}.ods_payment_info partition(dt='$do_date');load data inpath '/origin_data/$APP/db/base_category1/$do_date' OVERWRITE into table ${APP}.ods_base_category1 partition(dt='$do_date');load data inpath '/origin_data/$APP/db/base_category2/$do_date' OVERWRITE into table ${APP}.ods_base_category2 partition(dt='$do_date');load data inpath '/origin_data/$APP/db/base_category3/$do_date' OVERWRITE into table ${APP}.ods_base_category3 partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/base_trademark/$do_date' OVERWRITE into table ${APP}.ods_base_trademark partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/activity_info/$do_date' OVERWRITE into table ${APP}.ods_activity_info partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/activity_order/$do_date' OVERWRITE into table ${APP}.ods_activity_order partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/cart_info/$do_date' OVERWRITE into table ${APP}.ods_cart_info partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/comment_info/$do_date' OVERWRITE into table ${APP}.ods_comment_info partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/coupon_info/$do_date' OVERWRITE into table ${APP}.ods_coupon_info partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/coupon_use/$do_date' OVERWRITE into table ${APP}.ods_coupon_use partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/favor_info/$do_date' OVERWRITE into table ${APP}.ods_favor_info partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/order_refund_info/$do_date' OVERWRITE into table ${APP}.ods_order_refund_info partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/order_status_log/$do_date' OVERWRITE into table ${APP}.ods_order_status_log partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/spu_info/$do_date' OVERWRITE into table ${APP}.ods_spu_info partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/activity_rule/$do_date' OVERWRITE into table ${APP}.ods_activity_rule partition(dt='$do_date'); load data inpath '/origin_data/$APP/db/base_dic/$do_date' OVERWRITE into table ${APP}.ods_base_dic partition(dt='$do_date');
"sql2="
load data inpath '/origin_data/$APP/db/base_province/$do_date' OVERWRITE into table ${APP}.ods_base_province;load data inpath '/origin_data/$APP/db/base_region/$do_date' OVERWRITE into table ${APP}.ods_base_region;
"
case $1 in
"first"){
#$hive -e "$sql1"
#$hive -e "$sql2"sudo -u hive hive -e "$sql1"sudo -u hive hive -e "$sql2"
};;
"all"){
#$hive -e "$sql1"sudo -u hive hive -e "$sql1"
};;
esac
拿order_info举例,因为Sqoop导入的数据:
所以我们导入数据的时候它会按照格式进行按坑就位~:
LOAD DATA INPATH '/origin_data_cdh/gmall/db/order_info/2020-11-19'
overwrite into table test.ods_order_info_test PARTITION (dt='2020-11-19');
这篇关于二、数据仓库电商项目——ODS层的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!