原数据格式 :
gid id score
a1 1 90
a1 2 80
a1 3 79
a1 4 80
a2 1 79
a2 3 89
a3 2 45
a3 4 57
a4 3 56
a5 3 89
......
要把数据变成:
gid id_1 id_2 id_3 id_4
a1 90 80 79 80
a2 79 0 89 0
a3 0 45 0 57
.......
数据量很大,上百亿条,所以用hive与spark结合的形式:先在hive中建表,把每个gid的不同id都合并成一行,然后输入spark中进行分割处理成多列。
一、hive先将每个gid的多行数据合并成一行
-- Step 1 (Hive): collapse every (model_id, score) row of a gid into one
-- comma-separated string. Each element is "model_id=score" so the pairs can be
-- split apart and matched by id later, which also copes with gids that have
-- differing numbers of ids.
-- (The original used '#' inline comments, which HiveQL does not support and
-- which swallowed the FROM/GROUP BY clause; rewritten with '--'.)
CREATE TABLE a_id_score_concat AS
SELECT
    a.gid,
    -- collect_set deduplicates identical "model_id=score" pairs before joining
    concat_ws(',', collect_set(concat(a.model_id, '=', a.score))) AS score
FROM a_id_score a
GROUP BY a.gid;
输出形式:
a1 1=90,2=80,3=79,4=80
a2 1=79,3=89
二、用spark的函数,把合并的几行再分成多列,本质是python函数
# Step 2 (PySpark): split the concatenated "id=score" strings back into columns.
# The original pasted four import statements onto one physical line (a syntax
# error); they are separated here.
from pyspark.sql import SQLContext, Row
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
from pyspark.mllib.linalg import SparseVector, DenseVector

# Read the merged table produced by step 1.
# NOTE(review): assumes `sqlContext` (a HiveContext) was created earlier in the
# session -- it is never instantiated in this snippet; confirm before running.
df2 = sqlContext.sql("select * from a_id_score_concat")
def splits(lt1):
    """Expand one merged row into a fixed-width list of score columns.

    Args:
        lt1: a (gid, merged) pair, where ``merged`` is a string such as
            ``"10101=90,10103=79"`` -- "model_id=score" pairs joined by ','.

    Returns:
        A 13-element list of strings: ``[gid, score_10101, score_10102, ...]``
        in the order of ``ids`` below, with ``'0'`` in every column whose
        model_id is absent from the merged string.

    Raises:
        ValueError: if a pair's model_id is not in the known id list.
    """
    # Known model_ids, in output-column order. Slot 0 ('null') is a placeholder
    # so that each id's position lines up with its output column (column 0 is
    # the gid). If this ever becomes hot, hoist ids (or an id->index dict) to
    # module level instead of rebuilding it per row.
    ids = ['null', '10101', '10102', '10103', '10105', '10106', '10121',
           '10122', '10123', '10125', '10126', '10201', '10221']
    # Every score column defaults to '0'; absent ids therefore come out as 0.
    out = ['na'] + ['0'] * (len(ids) - 1)
    out[0] = lt1[0]  # gid goes in column 0
    for pair in lt1[1].split(","):
        parts = pair.split("=")          # ["model_id", "score"]
        out[ids.index(parts[0])] = parts[1]
    return out
# Apply splits() to every row and rebuild a DataFrame with one column per id.
# BUG FIX(review): the original passed the column-name list as a second
# argument to map() -- it belongs to createDataFrame() -- and was missing a
# closing parenthesis. (df.map works on Spark 1.x DataFrames; on Spark 2+ use
# df2.rdd.map instead.)
columns = ["gid", 'm10101', 'm10102', 'm10103', 'm10105', 'm10106', 'm10121',
           'm10122', 'm10123', 'm10125', 'm10126', 'm10201', 'm10221']
df3 = sqlContext.createDataFrame(df2.map(splits), columns)

# Persist the result as a parquet table so Hive can mount it (step 3 below).
df3.saveAsTable(tableName="id_scores", source="parquet", mode="overwrite")
hive中建表保存:
# Move the Spark job's HDFS output into the zhangb.db warehouse directory so
# the external table below can point at it.
# NOTE(review): 'hdfs:./...' is an unusual URI (expected 'hdfs:///path' or a
# plain path), and the source dir 'xiaofei_model_installed_pkgs' does not match
# the 'id_scores' table written above -- confirm both before running.
hadoop fs -mv hdfs:./xiaofei_model_installed_pkgs hdfs:./zhangb.db
-- Step 3 (Hive): mount the parquet output written by Spark as an external table.
-- The original fused the whole DDL onto one line with missing whitespace
-- ("...ParquetHiveSerDe'STORED...", "LOCATION'hdfs:..."); reformatted here.
-- NOTE(review): this column list (pkg/cnt01/...) does not match the id_scores
-- schema written by Spark above (gid, m10101, ...); confirm which dataset this
-- DDL is actually for. Also note 'rank' is a reserved word in newer Hive
-- versions and may need backquoting.
CREATE EXTERNAL TABLE aaaaaa (
    pkg string,
    cnt01 bigint,
    cnt11 bigint,
    xsb double,
    chi2 double,
    gain double,
    iv double,
    rank1 int,
    rank2 int,
    rank3 int,
    rank int
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS
    INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
    OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION 'hdfs:./zhangb.db/id_scores';