最近常常和Json打交道,记录一下Spark解析Json案例,数据有点长
数据示例,下面是一条数据(日志文件中每条记录独占一行,此处仅因排版被折成多行显示):
{"status":"1","regeocode":{"roads":[{"id":"0571H51F02100373","location":"120.349,30.303","direction":"东","name":"5号大街","distance":"200.895"},{"id":"0571H51F0210035161","location":"120.349,30.3039","direction":"东","name":"西部生活区南区4号路","distance":"204.745"},{"id":"0571H51F021004192","location":"120.351,30.3016","direction":"北","name":"6号大街","distance":"213.381"}],"roadinters":[{"second_name":"5号大街","first_id":"0571H51F0210035161","second_id":"0571H51F02100373","location":"120.3488517,30.30389417","distance":"210.63","first_name":"西部生活区南区4号路","direction":"东"}],"formatted_address":"浙江省杭州市江干区白杨街道浙江育英职业技术学院继续教育分院浙江育英职业技术学院","addressComponent":{"city":"杭州市","province":"浙江省","adcode":"330104","district":"江干区","towncode":"330104009000","streetNumber":{"number":"818","location":"120.351234,30.3021814","direction":"南","distance":"149.438","street":"6号大街"},"country":"中国","township":"白杨街道","businessAreas":[{"location":"120.363533,30.305467","name":"白杨","id":"330104"},{"location":"120.35041,30.308644","name":"下沙","id":"330104"}],"building":{"name":[],"type":[]},"neighborhood":{"name":[],"type":[]},"citycode":"0571"},"aois":[{"area":"100617.469379","type":"141201","id":"B023B01B42","location":"120.351458,30.304785","adcode":"330104","name":"浙江育英职业技术学院","distance":"0"}],"pois":[{"id":"B023B01B42","direction":"北","businessarea":"白杨","address":"4号大街","poiweight":"0.583587","name":"浙江育英职业技术学院","location":"120.351458,30.304785","distance":"148.443","tel":"0571-86912828;0571-86877012;0571-86911032","type":"科教文化服务;学校;高等院校"},{"id":"B0FFF0E5ZZ","direction":"东","businessarea":"白杨","address":"下沙6号大街杭州市实验外国语学校(高中部)附近","poiweight":"0.317653","name":"浙江育英职业技术学院继续教育分院","location":"120.351882,30.303522","distance":"84.6825","tel":[],"type":"科教文化服务;学校;成人教育|科教文化服务;学校;职业技术学校"},{"id":"B023B07I8N","direction":"东","businessarea":"白杨","address":"下沙6号大街","poiweight":"0.410008","name":"杭州市实验外国语学校(高中部)","location":"120.352256,30.303617","distance":"121.179","tel":[],"type":"科教文化服务;学校;中学"},{"id":"B0FFGATJJD"
,"direction":"东南","businessarea":"白杨","address":"下沙六号路精欧荣寓3幢","poiweight":"0.185757","name":"大胜画室","location":"120.352035,30.302698","distance":"134.26","tel":[],"type":"科教文化服务;培训机构;培训机构"},{"id":"B0FFFTQGCG","direction":"西南","businessarea":"白杨","address":"五号路六号路口","poiweight":"0.238657","name":"橄榄树幼儿园(5号大街)","location":"120.350327,30.302496","distance":"129.949","tel":"15168361016","type":"科教文化服务;学校;幼儿园"},{"id":"B023B0BVSW","direction":"东南","businessarea":"白杨","address":"6号大街880附近","poiweight":"0.461738","name":"中国邮政储蓄银行(下沙营业所)","location":"120.353033,30.302344","distance":"234.335","tel":"0571-85160745","type":"金融保险服务;银行;中国邮政储蓄银行"},{"id":"B023B0A5ND","direction":"东","businessarea":"白杨","address":"经济技术开发区9号大街9号","poiweight":"0.699491","name":"浙江中医药大学附属第一医院下沙院区","location":"120.353589,30.304031","distance":"255.214","tel":"0571-86919300;0571-87010630;0571-86919388","type":"医疗保健服务;综合医院;综合医院"},{"id":"B023B14BVN","direction":"南","businessarea":"白杨","address":"下沙5号路与6号大街交汇处","poiweight":"0.353768","name":"精欧荣寓","location":"120.35123,30.302559","distance":"108.044","tel":[],"type":"商务住宅;住宅区;住宅小区"},{"id":"B023B0A5NG","direction":"东","businessarea":"白杨","address":"9号大街9号","poiweight":"0.526879","name":"浙江省中医院下沙院区","location":"120.353747,30.304354","distance":"279.921","tel":"0571-86911001;0571-86919388","type":"医疗保健服务;综合医院;三级甲等医院"},{"id":"B023B14VPD","direction":"东","businessarea":"白杨","address":"经济技术开发区9号大街9号(近物美下沙店)","poiweight":"0.633546","name":"浙江中医药大学附属第一医院-浙江省东方医院","location":"120.353747,30.304354","distance":"279.921","tel":"0571-86918600","type":"医疗保健服务;综合医院;综合医院"},{"id":"B023B18N7C","direction":"西","businessarea":"白杨","address":"下沙经济技术开发区4号大街与3号街交叉口裕园公寓9幢二楼","poiweight":"0.144617","name":"思韵舞蹈(盛泰名都校区)","location":"120.348332,30.303994","distance":"261.74","tel":"15314614090;18989488225","type":"科教文化服务;培训机构;培训机构"},{"id":"B0FFF0E7FF","direction":"东","businessarea":"白杨","address":"下沙6号路四季名门西侧842号","poiweight":"0.139271","name":"薛承峰书画工作室","location":"120.353533,30.302
519","distance":"266.998","tel":[],"type":"科教文化服务;科教文化场所;科教文化场所"},{"id":"B0FFFWYQQ2","direction":"西","businessarea":"白杨","address":"下沙经济开发区5号大街四季风景苑会所二楼","poiweight":"0.186142","name":"杭州龙源青少年培训中心","location":"120.348455,30.302538","distance":"267.174","tel":"0571-88013295","type":"科教文化服务;培训机构;培训机构"},{"id":"B023B1AG6E","direction":"东北","businessarea":"白杨","address":"下沙4号大街9号浙江育英职业技术学院1411室","poiweight":"0.36555","name":"朝日日语(下沙校区)","location":"120.353025,30.305489","distance":"293.63","tel":"0571-85094699;0571-85094688;15395719324","type":"科教文化服务;培训机构;培训机构"},{"id":"B023B14LLE","direction":"南","businessarea":"白杨","address":"8号大街13号","poiweight":"0.271788","name":"北房工业园区","location":"120.351775,30.300585","distance":"333.662","tel":[],"type":"商务住宅;产业园区;产业园区"},{"id":"B0FFFVZ69J","direction":"西北","businessarea":"白杨","address":"5号大街297号","poiweight":"0.48","name":"盛泰开元名都大酒店","location":"120.348073,30.305443","distance":"353.791","tel":"0571-88279999","type":"住宿服务;宾馆酒店;五星级宾馆"},{"id":"B023B187KX","direction":"西北","businessarea":"白杨","address":"下沙经济技术开发区4号大街505号","poiweight":"0.48","name":"上品折扣(西子·阳光星城1幢南)","location":"120.34807,30.305445","distance":"354.155","tel":"0571-86932539","type":"购物服务;商场;普通商场"},{"id":"B023B0AAWY","direction":"北","businessarea":"白杨","address":"下沙开发区4号大街17-6号","poiweight":"0.33749","name":"白杨街道办事处","location":"120.351197,30.306567","distance":"340.458","tel":"0571-86912111","type":"政府机构及社会团体;政府机关;乡镇级政府及事业单位"},{"id":"B023B14PA1","direction":"北","businessarea":"白杨","address":"下沙开发区4号大街17-6号","poiweight":"0.156572","name":"江干区人大常委会白杨街道工作委员会","location":"120.351197,30.306567","distance":"340.458","tel":[],"type":"政府机构及社会团体;政府机关;乡镇级政府及事业单位"},{"id":"B0FFFZYCNB","direction":"北","businessarea":"白杨","address":"下沙开发区4号大街17-6号","poiweight":"0.146109","name":"中共白杨街道工作委员会","location":"120.351197,30.306567","distance":"340.458","tel":[],"type":"政府机构及社会团体;政府机关;乡镇级政府及事业单位"},{"id":"B023B0A7OW","direction":"西南","businessarea":"白杨","address":"六号大街968号","poiweight":"0.
47408","name":"格林联盟酒店6号大街店(南门)","location":"120.350177,30.301654","distance":"221.009","tel":[],"type":"通行设施;通行设施;通行设施"},{"id":"B023B18K71","direction":"东","businessarea":"白杨","address":"经济技术开发区9号大街9号浙江省中医院下沙院区内","poiweight":"0.126103","name":"浙江省中医院下沙院区门诊部","location":"120.353796,30.304608","distance":"294.908","tel":"0571-86918600;0571-86911001","type":"医疗保健服务;综合医院;综合医院"},{"id":"B023B14FW7","direction":"西","businessarea":"白杨","address":"西部生活区南区1号路东侧","poiweight":"0.32391","name":"盛泰名都公寓","location":"120.347984,30.304409","distance":"306.331","tel":[],"type":"商务住宅;住宅区;住宅小区"},{"id":"B0FFFFKR8U","direction":"北","businessarea":"白杨","address":"下沙开发区4号大街17-6号","poiweight":"0.160793","name":"白杨街道流动人口服务管理中心","location":"120.351120,30.306500","distance":"332.688","tel":[],"type":"政府机构及社会团体;政府机关;乡镇以下级政府及事业单位"},{"id":"B0FFF00OF8","direction":"北","businessarea":"白杨","address":"下沙开发区4号大街17-6号","poiweight":"0.148719","name":"白杨街道社会服务管理中心","location":"120.351197,30.306567","distance":"340.458","tel":[],"type":"政府机构及社会团体;政府机关;乡镇级政府及事业单位"},{"id":"B023B18L3G","direction":"东","businessarea":"白杨","address":"6号大街842-3号(与9号大街交汇处,邮政局东侧)","poiweight":"0.195906","name":"笑傲旅馆","location":"120.353535,30.302660","distance":"261.087","tel":"0571-86932770","type":"住宿服务;旅馆招待所;旅馆招待所"},{"id":"B023B19VD0","direction":"东北","businessarea":"白杨","address":"经济技术开发区9号大街9号浙江省中医院下沙院区内","poiweight":"0.144617","name":"浙江省中医院下沙院区急诊","location":"120.353972,30.304755","distance":"317.137","tel":[],"type":"医疗保健服务;综合医院;综合医院"},{"id":"B023B0A7DV","direction":"东","businessarea":"白杨","address":"9号大街以西","poiweight":"0.330247","name":"四季名门城市公寓","location":"120.354084,30.302499","distance":"316.704","tel":[],"type":"商务住宅;住宅区;住宅小区"},{"id":"B023B14P9V","direction":"北","businessarea":"白杨","address":"下沙开发区4号大街17-6号","poiweight":"0.148719","name":"中共白杨街道纪律检查工作委员会","location":"120.351197,30.306567","distance":"340.458","tel":[],"type":"政府机构及社会团体;政府机关;乡镇级政府及事业单位"},{"id":"B0FFGATJBD","direction":"西","businessarea":"白杨","addres
s":"盛泰名都公寓2幢4号","poiweight":"0.192243","name":"唯雅口腔","location":"120.347280,30.303878","distance":"359.48","tel":"0571-88375558;0571-88375556","type":"医疗保健服务;专科医院;口腔医院"}]},"info":"OK","infocode":"10000"}
代码
用了三种数据结构,就是想看看有什么算子的用法区别,然而其实本质上都是map形式的并没什么区别
package com
import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Spark job that summarises the `regeocode.pois` entries of reverse-geocoding
 * responses stored as one JSON document per line.
 *
 * Only records whose `status` is "1" (successful lookup) are used; records with
 * `status` "0" (failed lookup) are skipped.
 *
 * Two reports are printed to stdout, highest count first:
 *  1. `(businessarea, count)` - number of POIs in each business area.
 *  2. `(tag, count)` - number of POIs carrying each tag, where the tags are the
 *     individual segments of a POI's `type` field.
 *
 * Usage: the first command-line argument, when present, is the input path;
 * otherwise the original default path is read.
 *
 * @author 小羊羊成长之路
 * @create 2019-12-06-13:58
 */
object test {

  /** Input file read when no path is given on the command line. */
  private val DefaultInputPath = "E:\\json.log"

  /** Text fastjson's `getString` yields for a field holding an empty JSON array. */
  private val EmptyArrayText = "[]"

  /** Normalises a raw field value: null, blank and the empty-array placeholder all become "". */
  private def clean(raw: String): String =
    Option(raw).map(_.trim).filter(_ != EmptyArrayText).getOrElse("")

  /**
   * Parses one input line and returns `(businessarea, type)` for every POI of a
   * successful record.
   *
   * Returns an empty sequence when the line parses to null, `status` is not "1",
   * or `regeocode` / `pois` is absent. Malformed JSON is not swallowed: the
   * parser's exception propagates and fails the job.
   *
   * @param line one line of the input file
   * @return one `(businessarea, type)` pair per POI; missing values are ""
   */
  private def poiFields(line: String): Seq[(String, String)] = {
    val fields = for {
      record <- Option(JSON.parseObject(line)) // Option(...) guards a null parse result
      if record.getString("status") == "1"     // Scala's == is null-safe, so a missing status is simply skipped
      regeocode <- Option(record.getJSONObject("regeocode"))
      pois <- Option(regeocode.getJSONArray("pois"))
    } yield (0 until pois.size()).flatMap { i =>
      // Read each element directly instead of re-serialising and re-parsing it.
      Option(pois.getJSONObject(i)).map { poi =>
        (clean(poi.getString("businessarea")), clean(poi.getString("type")))
      }
    }
    fields.getOrElse(Seq.empty)
  }

  /**
   * Splits a POI `type` value into its tags.
   *
   * Segments are separated by ';'. A POI may also list several types joined by
   * '|' (the sample data contains "...;成人教育|科教文化服务;..."), so '|' is
   * treated as a separator too; otherwise such a pair would be counted as one tag.
   */
  private def tagsOf(poiType: String): Array[String] =
    poiType.split("[;|]").map(_.trim).filter(_.nonEmpty)

  /** Prints `(key, count)` pairs on the driver, largest count first. */
  private def printRanked(counts: RDD[(String, Int)]): Unit =
    counts
      .sortBy(_._2, ascending = false, numPartitions = 1)
      .collect() // bring the (small) aggregate to the driver so output order is the sort order
      .foreach(println)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("pois")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)

    try {
      val inputPath = args.headOption.getOrElse(DefaultInputPath)

      // Parse every line exactly once; both reports reuse the cached POI fields.
      val pois: RDD[(String, String)] = sc.textFile(inputPath).flatMap(poiFields).cache()

      // 1. POIs per business area: every POI with a non-empty businessarea counts once.
      printRanked(
        pois
          .map(_._1)
          .filter(_.nonEmpty)
          .map(area => (area, 1))
          .reduceByKey(_ + _)
      )

      // 2. POIs per tag. A flatMap emits the (tag, 1) pairs directly, so no
      //    per-record buffer (List / ArrayBuffer / HashMap) is needed.
      printRanked(
        pois
          .flatMap { case (_, poiType) => tagsOf(poiType) }
          .map(tag => (tag, 1))
          .reduceByKey(_ + _)
      )
    } finally {
      // Always release the context, even when a stage fails.
      sc.stop()
    }
  }
}