1. 需求目的:

约5千万条出租车gps数据存储在mysql中,数据按照一定时间频率记录,想要获取出其中的订单数量。

(数据示例)

出租车数据挖掘案例实战_出租车数据挖掘案例实战

2. 思路分析

plate_number是车牌号,state=1表示车上有人。数据先按照车辆分组,然后按时间排序,最后选出连续的序列state=1作为一个订单

3. 实践

3.1 sql合并数据

由于数据在mysql中被分成了200+个表,有两种处理方式:

  1. 分别处理每个表获得每个车牌号数据,再按车牌号合并
  2. 先把表合并成大表,直接整体处理获得每个车牌号数据(used in my project

2021.10.30
需求:一天的出租车轨迹表被成了几百个表,需要将这些表合并
思路:读出这个数据库中所有的表名,循环将所有表合并到一个新表中(使用存储过程)
学习过程

  1. 两个表垂直拼接的方法
# 两个表之间的连接查询,但是会删除两个表中的重复数据`
select * from table union select * from table2
两个表之间的连接查询,不删除两个表中的重复数据`
select * from table union all select * from table2

comment:这样子写200+个句子不不现实,应该使用循环或者直接获取数据库中的表名

存储过程

DELIMITER $$

USE `taxigps`$$

DROP PROCEDURE IF EXISTS `test11`$$

CREATE DEFINER=`root`@`%` PROCEDURE `test11`()
BEGIN
        DECLARE stopflag INT DEFAULT 0;
        DECLARE tablename VARCHAR(64);
        -- 创建一个游标变量,declare 变量名 cursor ...
        DECLARE tablename_cur CURSOR FOR SELECT table_name FROM information_schema.tables WHERE table_schema='taxigps' AND table_name != 'taxi';
        -- 游标是保存查询结果的临时区域
        -- 游标变量username_cur保存了查询的临时结果,实际上就是结果集
        -- 当游标变量中保存的结果都查询一遍(遍历),到达结尾,将变量stopflag设置为1,用于循环中判断是否结束
        DECLARE CONTINUE HANDLER FOR NOT FOUND SET stopflag=1;
 
        OPEN tablename_cur; -- 打卡游标
        FETCH tablename_cur INTO tablename; -- 游标向前走一步,取出一条记录放到变量username中
#        SELECT tablename,000;
        WHILE(stopflag=0) DO -- 如果游标还没有结尾,就继续
            BEGIN 
                -- 在用户名前门拼接 '_cur' 字符串
                SET @sql = CONCAT('insert into taxi SELECT * FROM ',tablename);
		PREPARE ss FROM @sql;
		EXECUTE ss;
                
                #INSERT INTO taxi SELECT * FROM trablename;
                FETCH tablename_cur INTO tablename;
            END;
        END WHILE; -- 结束循环
        CLOSE tablename_cur; -- 关闭游标
	END$$

DELIMITER ;

3.2 按车牌号分组

"""
@author:HY
@time:2021/10/31:14:51
"""
from threading import Thread
import threading
from time import sleep, ctime
import pandas as pd
import time
import pickle


def solve_file(name, file):
    for index, row in file.iterrows():
        date, taxi_time, _, plate_number, lng, lat, _, _, state, _ = row
        if plate_number in taxi_dic.keys():
            taxi_dic[plate_number][taxi_time] = state
        else:
            taxi_dic[plate_number] = {taxi_time: state}


df = pd.read_csv('test.csv', header=None, sep='\t')
# df = pd.read_csv('20160920_taxigps.csv', header=None, sep='\t')

line_num = len(df)
taxi_dic = {}
time1 = time.time()

thread_num = 1
for i in range(thread_num):
    i = i + 1
    exec('divide_file' + str(i) + '  = int(line_num * ' + str(i) + '  / thread_num)')
    if i == 1:
        exec('df' + str(i) + ' = df[divide_file1:]')
    elif i == thread_num:
        exec('df' + str(i) + ' = df[:divide_file' + str(thread_num) + ']')
    else:
        exec('df' + str(i) + ' = df[divide_file' + str(i-1) + ':divide_file' + str(i) + ']')


for i in range(thread_num):
    exec('t' + str(i) + ' = threading.Thread(target=solve_file, args=('+ str(i) + ', df'+ str(i) + '))')
    exec('t' + str(i) + '.start()')
    exec('t' + str(i) + '.join()')

time2 = time.time()
with open('多线程-车辆数据字典.pickle', 'wb') as f:
    pickle.dump(taxi_dic, f)
taxi_dic.clear()
time3 = time.time()
solve_file('all',df)
time4 = time.time()
with open('单线程-车辆数据字典.pickle', 'wb') as f:
    pickle.dump(taxi_dic, f)
print(f'多线程聚合耗时:time={time2-time1},单线程聚合耗时:time={time4-time3}')

3.3 生成订单数据

需求:3.2获得了每个车辆的数据,对于每个车辆这里需要按时间排序,然后生成一个订单
思路:多线程遍历每个车,寻找连续的state=1序列(多线程没单线程快…)
学习过程

  1. 线程的暂停与恢复
  2. 关于锁:一个锁锁一个东西,这个东西释放了,别人才能拿到这个锁
    锁的描述
  3. 字典按照key排序:
sort_date = sorted(data.items(), key=lambda item: item[0])
"""
@author:HY
@time:2021/11/1:10:34
"""
import pickle
import pandas as pd
from tqdm import tqdm
import threading


class Node:
    def __init__(self, value):
        self.value = value
        self.next = None


class Queue:
    """
    队列用于存储每个线程要处理的数据
    """
    def __init__(self):
        self.head = None
        self.tail = None
        self.length = 0

    def pop(self):
        if self.head is None: return None
        a = self.head
        if self.head.next is None:
            self.head = self.tail = None
        else:
            self.head = self.head.next
        self.length -= 1
        return a

    def push(self, node):
        if self.head is None:
            self.head = self.tail = node
        else:
            self.tail.next = node
            self.tail = node

        self.length += 1


class Request:
    def __init__(self, start_time, s_lng, s_lat):
        self.start_time = start_time
        self.s_lng = s_lng
        self.s_lat = s_lat
        self.e_lng = None
        self.e_lat = None
        self.end_time = None

class MY_Server(threading.Thread):
    def __init__(self, name, value=Queue()):
        threading.Thread.__init__(self)
        self.mName = name
        self.mEvent = threading.Event()
        self.data_queue = value
        self.is_running = False
        self.recieving = True

    def run(self):
        while self.recieving or self.data_queue.length > 0:  # 逐个车遍历。当该线程还在接受车辆或者队列还有的数据的时候都不停遍历

            data = self.data_queue.pop()  # data是一个列表,每个元素是[time, state,lng,lat]
            while data is None and self.recieving:
                self.is_running = False
                data = self.data_queue.pop()
            if data is None: break
            self.is_running = True

            # 排序字典:按时间排序

            sort_date = sorted(data.value, key=lambda item: item[0])
            vehicle_req_list = self.get_request(sort_date)

            # 将该车的订单加入列表
            lock.acquire()
            req_list.extend(vehicle_req_list)
            lock.release()

    def get_request(self, sort_data):
        """
        sort_data是一个列表,每个元素是一个列表 now, state, lng, lat
        """
        last = [None, 0, None, None]
        request_list = []
        one_request = None
        for s in sort_data:
            now, state, lng, lat = s  # 此刻状态
            last_now, last_state, last_lng, last_lat = last  # 之前状态
            if last_state == 0:  # 之前是没有人的状态,找1建立request
                if state == 0:
                    continue  # 中间的0全部跳过
                elif state == 1:  # 找到了一个首字母1,建立request并且last记录为当前
                    one_request = Request(now, lng, lat)
                    last = [now, state, lng, lat]
            elif last_state == 1:  # 有人的状态,找0前面的1
                if state == 1:  # 不断记录直到最后一个
                    last = [now, state, lng, lat]
                elif state == 0:
                    one_request.end_time = last_now
                    one_request.e_lng = last_lng
                    one_request.e_lat = last_lat
                    request_list.append(one_request)
        return request_list

    def pause(self):
        self.mEvent.clear()

    def resume(self):
        self.mEvent.set()

import time
if __name__ == "__main__":
    time1 = time.time()
    # 数据获取
    # all_dic = {'a': {30: 1, 40: 0, 50: 1}, 'b': {30: 1, 40: 0, 50: 0}}
    with open('多线程-车辆数据字典.pickle', 'rb') as f:
        all_dic = pickle.load(f)

    # 生成一个锁对象
    lock = threading.Lock()
    # 结果集
    req_list = []

    # 线程数量
    thread_num = 1

    # 开启线程
    # t0 = MY_Server(0)
    # t0.start()
    #
    # t1 = MY_Server(1)
    # t1.start()
    #
    # t2 = MY_Server(2)
    # t2.start()

    for i in range(thread_num):  # 12个线程
        exec('t' + str(i) + ' = MY_Server(i)')
        exec('t' + str(i) + '.start()')

    # 遍历数据输入给每个线程
    taxi_num = 0
    for _, value in all_dic.items():  # value是字典,key为时间,value为别的
        turn_to_whom = taxi_num % thread_num
        n = Node(value)
        exec('t' + str(turn_to_whom) + '.data_queue.push(n)')

    # 告诉所有线程数据传送完毕
    for i in range(thread_num):  # 12个线程
        exec('t' + str(i) + '.recieving = False')

    # join()
    for i in range(thread_num):  # 12个线程
        exec('t' + str(i) + '.join()')

    time2 = time.time()
    print(len(req_list))
    print('耗时', time2 - time1)
    # 保存订单数据
    with open('shenzhen_req.pickle', 'wb') as f:
        pickle.dump(req_list, f)

最后发现不如单线程跑得快,因此最终程序为

"""
@author:HY
@time:2021/11/1:16:51
"""
import pandas as pd
import time
import pickle
from tqdm import tqdm
import datetime

class Request:
    def __init__(self, start_time, s_lng, s_lat):
        self.start_time = start_time
        self.s_lng = s_lng
        self.s_lat = s_lat
        self.e_lng = None
        self.e_lat = None
        self.end_time = None
def solve_file(name, file):
    taxi_dic = {}
    for index, row in tqdm(file.iterrows(), desc='处理成逐个车辆'):
        date, taxi_time, _, plate_number, lng, lat, _, _, state, _ = row
        if plate_number in taxi_dic.keys():
            taxi_dic[plate_number].append([taxi_time, state, lng, lat])
        else:
            taxi_dic[plate_number] = [[taxi_time, state, lng, lat],]
    print('车辆数目', len(taxi_dic.keys()))
    with open('数据文件/单线程-车辆数据字典.pickle', 'wb') as f:
        pickle.dump(taxi_dic, f)
    return taxi_dic

def sum_request(taxi_dic):
    req_list = []
    for _, value in tqdm(taxi_dic.items(), desc='处理每个车辆'):
        sort_date = sorted(value, key=lambda item: item[0])
        vehicle_req_list = get_request(sort_date)
        req_list.extend(vehicle_req_list)
    # 保存订单数据
    with open('数据文件/shenzhen_req.pickle', 'wb') as f:
        pickle.dump(req_list, f)

def get_request(sort_data):
        """
        sort_data是一个列表,每个元素是一个列表 now, state, lng, lat
        """
        last = [None, 0, None, None]
        request_list = []
        one_request = None
        for s in sort_data:
            now, state, lng, lat = s  # 此刻状态
            last_now, last_state, last_lng, last_lat = last  # 之前状态
            if last_state == 0:  # 之前是没有人的状态,找1建立request
                if state == 0:
                    continue  # 中间的0全部跳过
                elif state == 1:  # 找到了一个首字母1,建立request并且last记录为当前
                    one_request = Request(now, lng, lat)
                last = [now, state, lng, lat]
            elif last_state == 1:  # 有人的状态,找0前面的1
                if state == 0:
                    one_request.end_time = last_now
                    one_request.e_lng = last_lng
                    one_request.e_lat = last_lat
                    request_list.append(one_request)
                last = [now, state, lng, lat]
        last_now, last_state, last_lng, last_lat = last  # 之前状态
        if last_state == 1:
            one_request.end_time = last_now
            one_request.e_lng = last_lng
            one_request.e_lat = last_lat
            request_list.append(one_request)
        return request_list

def get_file():
    """
    处理原文件获得订单
    :return:
    """
    time1 = time.time()
    df = pd.read_csv('20160920_taxigps.csv', header=None, sep='\t')
    time2 = time.time()
    taxi_dic = solve_file('all', df)
    time3 = time.time()
    sum_request(taxi_dic)
    time4 = time.time()

    print(f'读数据时间{time2-time1},处理成单车辆时间{time3-time2},统计订单时间{time4-time3}')