import cx_Oracle
import pandas as pd
import numpy as np
import os
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.AL32UTF8' # 读取oracle数据所用编码
import warnings
warnings.filterwarnings('ignore')
def get_data_from_oracle(tableName):
db = cx_Oracle.connect('nfdw','nfdw','172.16.104.120:1521/ora120') # 创建连接
cr = db.cursor()
sql = 'select * from ' + str(tableName)
cr.execute(sql)
colsName = [i[0] for i in cr.description] # 列名
data = pd.DataFrame(cr.fetchall(),columns=colsName) # 返回所有查询结果
return data
#取数据
sg_info=get_data_from_oracle('A_GTF_SG_BASIC_INFO')
B_GTF_SG_RISK_PROJECTS
#选择其中一家
sg_info_4409=sg_info.ix[sg_info['公司编码']=='4409',:]
sg_infoeda_4409=pd.DataFrame(sg_info_4409.isnull().astype(int).sum(axis=0))
#添加公司记录数
sg_infoeda_4409['n']=22088
sg_infoeda_4409.columns=['null','n']
sg_infoeda_4409['null_rate']=sg_infoeda_4409['null']/sg_infoeda_4409['n']
# -*- coding:utf-8 -*-
__author__ = 'Canon'
import cx_Oracle
import pandas as pd
import numpy as np
import time
import datetime
from copy import deepcopy
import warnings
import os
warnings.filterwarnings('ignore')
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.AL32UTF8' # 读取oracle数据所用编码
# 从oracle获取数据
# 输入:tableName_表名,str
# 输出:data,DataFrame
def get_data_from_oracle(db, tableName):
# db = cx_Oracle.connect('nfdw','nfdw','172.16.104.120:1521/ora120') # 创建连接
cr = db.cursor()
sql = 'select * from ' + str(tableName)
cr.execute(sql)
colsName = [i[0] for i in cr.description] # 列名
data = pd.DataFrame(cr.fetchall(),columns=colsName) # 返回所有查询结果
return data
# 将数据写入oracle数据库
# data: type:dataframe or array 需要写入的数据
# db: type:cx_Oracle.connection 数据库连接
# tableName: type:string 要写入的表名
def data2DB(data, db, tableName):
columns = list(data.columns)
cr = db.cursor() # 创建游标
dataOracle = list(map(lambda x:dict(zip(columns, x)),np.array(data))) # 将结果转为符合插入数据库的格式dict
# 使用参数形式将数据插入数据库
sql = 'insert into '+ tableName + '('
for col in columns:
sql = sql + col + ','
sql = sql[:-1] + ')'
sql = sql + ' values ('
for col in columns:
sql = sql + ':' + col + ','
sql = sql[:-1] + ')'
cr.executemany(sql, dataOracle) # 执行多条插入
db.commit() # 同步
# 哑变量变换
# 输入:data 原数据 DataFrame
# columnName 进行哑变量变换的列名 str
def dummies(data, columnName):
dummiesData = pd.get_dummies(data[columnName], prefix=columnName) # 哑变量变换
data[dummiesData.columns] = dummiesData # 哑变量后赋值给原数据
del data[columnName] # 删去原来的那一列
return data
# 将数据处理成符合建模的格式
# data: 特征数据
# colsToDummiersList: 需要进行哑变量变换的属性列表
def transform_data(data, colsToDummiersList):
for col in colsToDummiersList:
data = dummies(data, col) # 哑变量变换
return data
import matplotlib.pyplot as plt
#绘制直方图
def drawHist(heights, xTitle, yTitle, bins = 100):
#创建直方图
#第一个参数为待绘制的定量数据,不同于定性数据,这里并没有事先进行频数统计
#第二个参数为划分的区间个数
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
plt.hist(heights, bins)
plt.xlabel(xTitle)
plt.ylabel(yTitle)
plt.show()
def get_data():
db = cx_Oracle.connect('nfdw','nfdw','172.16.104.120:1521/ora120') # 创建连接
datetimeNow = pd.to_datetime('2018-06-28') # 用取数时间填充,后期连接数据库后需要修改
dateBegin = pd.to_datetime('1900-01-01')
dateEnd = pd.to_datetime('2018-06-28')
datetimePointBegin1 = pd.to_datetime(dateBegin) # 统计开始时间点1,立项时间
datetimePointBegin2 = pd.to_datetime(dateEnd) - pd.to_timedelta(3, unit='y') # 统计开始时间点2,立项时间 - 3年
datetimePointEnd = pd.to_datetime(dateEnd) # 统计截止时间点,立项时间
headOffice = None
area = None
customerType = None
dataBasicInfo = get_data_from_oracle(db, 'A_GTF_SJ_BASIC_INFO') # 从数据库读取数据,项目基本信息
agreementInfo = get_data_from_oracle(db, 'A_GTF_SJ_AGREEMENT') # 从数据库读取数据,合同信息表
companyCodeTable = get_data_from_oracle(db, 'A_GTF_CODE_COMPANY') # 从数据库读取数据,公司代码对照表
professionCodeTable = get_data_from_oracle(db, 'A_GTF_CODE_PROFESSION') # 从数据库读取数据,专业代码对照表
dataBasicInfo['所属地域'] = dataBasicInfo['所属地域'].fillna('其他')
dataBasicInfo['省份'] = dataBasicInfo['所属地域'].apply(lambda x: x.split('-')[1] if len(x.split('-'))>1 else '其他') # 获取省份
dataBasicInfo['专业公司'] = pd.merge(dataBasicInfo[['项目编号','公司编码']], companyCodeTable[['CODE_HEADOFFICE', 'HEADOFFICE']].drop_duplicates(), left_on='公司编码', right_on='CODE_HEADOFFICE', how='left')['HEADOFFICE']
dataBasicInfo['分公司'] = pd.merge(dataBasicInfo[['项目编号','经营单位CODE']], companyCodeTable[['CODE_BRANCHOFFICE', 'BRANCHOFFICE']].drop_duplicates(), left_on='经营单位CODE', right_on='CODE_BRANCHOFFICE', how='left')['BRANCHOFFICE']
dataBasicInfo['项目部'] = pd.merge(dataBasicInfo[['项目编号','项目部CODE']], companyCodeTable[['CODE_DEPARTMENT', 'DEPARTMENT']].drop_duplicates(), left_on='项目部CODE', right_on='CODE_DEPARTMENT', how='left')['DEPARTMENT']
dataBasicInfo.ix[dataBasicInfo['客户类别一级'] == '海外', '客户类别一级'] = '海外客户' # 把“海外”统一为“海外客户”
dataBasicInfo['专业'] = pd.merge(dataBasicInfo[['项目编号','专业CODE']], professionCodeTable[['CODE_PROFESSION', 'PROFESSION']].drop_duplicates(), left_on='专业CODE', right_on='CODE_PROFESSION', how='left')['PROFESSION']
dataBasicInfo['专业'] = dataBasicInfo['专业'].fillna('其他')
dataBasicInfo['所属地域'] = dataBasicInfo['所属地域'].fillna('未知')
dataBasicInfo['关联交易类型'] = dataBasicInfo['关联交易类型'].fillna('未知')
dataBasicInfo['首次合同签订时间'] = dataBasicInfo['首次合同签订时间'].fillna(datetimeNow)
dataBasicInfo['项目收入关闭日期'] = dataBasicInfo['项目收入关闭日期'].fillna(datetimeNow)
dataBasicInfo['业务关闭时间'] = dataBasicInfo['业务关闭时间'].fillna(datetimeNow)
dataBasicInfo['财务关闭时间'] = dataBasicInfo['财务关闭时间'].fillna(datetimeNow)
dataBasicInfo['最后一次收款时间'] = dataBasicInfo['财务关闭时间'].fillna(datetimeNow)
dataBasicInfo.ix[dataBasicInfo['首次预算收入'].isna(), '首次预算收入'] = dataBasicInfo.ix[dataBasicInfo['首次预算收入'].isna(), '当前预算收入']
dataBasicInfo.ix[dataBasicInfo['首次预算成本'].isna(), '首次预算成本'] = dataBasicInfo.ix[dataBasicInfo['首次预算成本'].isna(), '当前预算成本']
dataBasicInfo.ix[dataBasicInfo['当前预算毛利率'] < -100, '当前预算毛利率'] = -100
dataBasicInfo.ix[dataBasicInfo['当前预算毛利率'] > 100, '当前预算毛利率'] = 100
dataBasicInfo.ix[dataBasicInfo['首次毛利率'] < -100, '首次毛利率'] = -100
dataBasicInfo.ix[dataBasicInfo['首次毛利率'] > 100, '首次毛利率'] = 100
dataBasicInfo.ix[dataBasicInfo['首次毛利率'].isna(), '首次毛利率'] = dataBasicInfo.ix[dataBasicInfo['首次毛利率'].isna(), '当前预算毛利率']
tmpMergeData = pd.merge(dataBasicInfo, agreementInfo, how='left', left_on='项目编号', right_on='主项目编号')
agreementIncome = pd.pivot_table(tmpMergeData[tmpMergeData['合同形式'] != '框架合同'], values='合同签订金额', index='项目编号', aggfunc='sum').fillna(0).reset_index() # 计算合同累计金额,未考虑多对多情况,需要修改
dataBasicInfo = pd.merge(dataBasicInfo, agreementIncome, how='left', on='项目编号')
agreementCount = pd.pivot_table(tmpMergeData[['项目编号', '合同编号']].drop_duplicates(), values='合同编号', index='项目编号', aggfunc='count').fillna(0).reset_index() # 计算项目对应的合同数量
agreementCount.columns = ['项目编号', '合同数量'] # 是否需要剔除框架合同???????
dataBasicInfo = pd.merge(dataBasicInfo, agreementCount, how='left', on='项目编号')
orderContractIncome = pd.pivot_table(tmpMergeData.ix[tmpMergeData['合同形式'] == '订单合同'], values='合同签订金额', index='项目编号', aggfunc='sum').fillna(0).reset_index() # 计算订单合同累计金额,未考虑多对多情况,需要修改
orderContractIncome.columns = ['项目编号', '累计订单合同金额']
dataBasicInfo = pd.merge(dataBasicInfo, orderContractIncome, how='left', on='项目编号')
frameContractIncome = pd.pivot_table(tmpMergeData.ix[tmpMergeData['合同形式'] == '框架合同'], values='合同签订金额', index='项目编号', aggfunc='sum').fillna(0).reset_index() # 计算订单合同累计金额,未考虑多对多情况,需要修改
frameContractIncome.columns = ['项目编号', '累计框架合同金额']
dataBasicInfo = pd.merge(dataBasicInfo, frameContractIncome, how='left', on='项目编号')
dataBasicInfo['自作子项累计工作量'] = dataBasicInfo['自作子项累计工作量'].fillna(0)
dataBasicInfo.ix[dataBasicInfo['自作子项累计工作量'] < 0, '自作子项累计工作量'] = 0 # 负值填充为0
dataBasicInfo['自作子项累计预算收入'] = dataBasicInfo['自作子项累计预算收入'].fillna(0)
dataBasicInfo['自作材料预算成本之和'] = dataBasicInfo['自作材料预算成本之和'].fillna(0)
dataBasicInfo['累计材料成本'] = dataBasicInfo['累计材料成本'].fillna(0)
dataBasicInfo.ix[dataBasicInfo['累计材料成本'] < 0, '累计材料成本'] = 0 # 负值填充为0
dataBasicInfo['分包子项累计工作量'] = dataBasicInfo['分包子项累计工作量'].fillna(0)
dataBasicInfo.ix[dataBasicInfo['分包子项累计工作量'] < 0, '分包子项累计工作量'] = 0 # 负值填充为0
dataBasicInfo['分包子项累计预算收入'] = dataBasicInfo['分包子项累计预算收入'].fillna(0)
dataBasicInfo['分包子项预算成本之和'] = dataBasicInfo['分包子项预算成本之和'].fillna(0)
dataBasicInfo['累计报账分包成本'] = dataBasicInfo['累计报账分包成本'].fillna(0)
dataBasicInfo.ix[dataBasicInfo['累计报账分包成本'] < 0, '累计报账分包成本'] = 0 # 负值填充为0
# 根据输入参数筛选
dataBasicInfo = dataBasicInfo[dataBasicInfo['项目立项完成时间']>=datetimePointBegin1]
dataBasicInfo = dataBasicInfo[(dataBasicInfo['专业公司'] == headOffice) | pd.isnull(headOffice)] # 选择专业公司,默认情况下选择所有专业公司
dataBasicInfo = dataBasicInfo[(dataBasicInfo['省份'] == area) | pd.isnull(area)] # 选择省份,默认情况下选择所有区域
dataBasicInfo = dataBasicInfo[(dataBasicInfo['客户类别一级'] == customerType) | pd.isnull(customerType)] # 选择客户类型,默认情况下选择所有客户类型
data = pd.DataFrame() # 存储最终数据
# 对基本属性做数据预处理
data['PROJECT_CODE'] = dataBasicInfo['项目编号']
data['HEADOFFICE'] = dataBasicInfo['公司编码'] # HEADOFFICE 专业公司
data['BRANCHOFFICE'] = dataBasicInfo['经营单位CODE'] # BRANCHOFFICE 分公司
data['DEPARTMENT'] = dataBasicInfo['项目部CODE'] # DEPARTMENT 项目部
data['项目类别'] = dataBasicInfo['项目类别']
data['关闭状态'] = dataBasicInfo['关闭状态']
data['收入关闭状态'] = dataBasicInfo['收入关闭状态']
# 项目专业,按“-”分割后取不同级别的类别,分割后为空则填充为“其他”
data['项目专业一级'] = dataBasicInfo['专业'].apply(lambda x: x.split('-')[0] if len(x.split('-'))>0 else '其他')
data['项目专业二级'] = dataBasicInfo['专业'].apply(lambda x: x.split('-')[1] if len(x.split('-'))>1 else '其他')
data['项目专业三级'] = dataBasicInfo['专业'].apply(lambda x: x.split('-')[2] if len(x.split('-'))>2 else '其他')
data['业务类型'] = dataBasicInfo['业务类型']
data['是否通服大项目合作'] = dataBasicInfo['是否通服大项目合作']
data['业务层级'] = dataBasicInfo['业务层级']
data['客户类型'] = dataBasicInfo['客户类别一级']
data['项目地点所在省'] = dataBasicInfo['所属地域'].apply(lambda x: x.split('-')[1] if len(x.split('-'))>1 else '未知') # 分割后取省,填充缺失值为“未知”
data['项目地点所在市'] = dataBasicInfo['所属地域'].apply(lambda x: x.split('-')[2] if len(x.split('-'))>2 else '未知') # 分割后取市,填充缺失值为“未知”
data['关联交易类型'] = dataBasicInfo['关联交易类型']
data['是否可外部合作专业'] = dataBasicInfo['是否可外部合作专业']
data['当前预算收入'] = dataBasicInfo['当前预算收入'] # 预算收入
data['预算收入等级'] = 2 # 预算收入等级
data.ix[dataBasicInfo['首次预算收入'] > 2000000, '预算收入等级'] = 1
data.ix[dataBasicInfo['首次预算收入'] < 500000, '预算收入等级'] = 3
dataBasicInfo['项目持续时间'] = (dataBasicInfo['财务关闭时间']-dataBasicInfo['项目立项完成时间']).apply(lambda x: x.days)
data['是否3年以上未关闭'] = (dataBasicInfo['关闭状态'] == '业务关闭') & (dataBasicInfo['项目持续时间'] >= 365*3)
data['是否3年内新立项'] = dataBasicInfo['项目立项完成时间'] >= datetimePointBegin2
data['项目是否关闭'] = dataBasicInfo['关闭状态'] == '业务关闭'
data['累计列账金额'] = dataBasicInfo['项目累计列账收入']
data['累计合同金额'] = dataBasicInfo['合同签订金额']
data['累计订单合同金额'] = dataBasicInfo['累计订单合同金额']
data['累计框架合同金额'] = dataBasicInfo['累计框架合同金额']
data['累计工作量'] = dataBasicInfo['累计工作量']
data['累计到款金额'] = dataBasicInfo['累计到款金额']
data['累计开票金额'] = dataBasicInfo['累计开票金额']
data['滚动年收入'] = dataBasicInfo['滚动年收入']
meanDaysGetIncome = (dataBasicInfo.ix[~dataBasicInfo['首次收款时间'].isna(), '首次收款时间'] - dataBasicInfo.ix[~dataBasicInfo['首次收款时间'].isna(),'项目立项完成时间']).mean().days # 已收款的项目,avg(当前时间-首次收款时间)
# 处理风险指标
data['CONTRACT_LINK_DATE'] = (dataBasicInfo['首次合同签订时间'] - dataBasicInfo['项目立项完成时间']).apply(lambda x: x.days) # 合同关联时间
data['PROJECT_IMPL_DATE'] = (dataBasicInfo['项目收入关闭日期'] - dataBasicInfo['项目立项完成时间']).apply(lambda x: x.days) # 项目实施时间
data['BUSINESS_FLOW_DATE'] = (dataBasicInfo['业务关闭时间'] - dataBasicInfo['项目收入关闭日期']).apply(lambda x: x.days) # 业务关闭时间差
data['FINANCIAL_CLOSE_DATE'] = (dataBasicInfo['财务关闭时间'] - dataBasicInfo['业务关闭时间']).apply(lambda x: x.days) # 财务关闭时间差
data['STATIC_DATE'] = (datetimeNow - dataBasicInfo['最后一次收款时间']).apply(lambda x: x.days) # 项目静止时间
data['ZERO_COLLECTION_DATE'] = 0 # 零收款项目时间
data.ix[dataBasicInfo['首次收款时间'].isna(), 'ZERO_COLLECTION_DATE'] = (datetimeNow - dataBasicInfo['项目立项完成时间']).apply(lambda x: x.days) - meanDaysGetIncome # 该指标只针对未收款项目
data.ix[data['ZERO_COLLECTION_DATE'] < 0, 'ZERO_COLLECTION_DATE'] = 0 # 最小值为0
data['PROJECT_CLOSE_DATE'] = (dataBasicInfo['业务关闭时间'] - dataBasicInfo['项目立项完成时间']).apply(lambda x: x.days) # 项目关闭周期,多余了
data['CURRENT_INCOME_OFFSET'] = ((dataBasicInfo['当前预算收入'] - dataBasicInfo['首次预算收入']) / dataBasicInfo['首次预算收入']).fillna(0)
data.ix[dataBasicInfo['首次预算收入'] == 0, 'CURRENT_INCOME_OFFSET'] = 0
data['COST_BUDGET_OFFSET'] = ((dataBasicInfo['当前预算成本'] - dataBasicInfo['首次预算成本']) / dataBasicInfo['首次预算成本']).fillna(0)
data.ix[dataBasicInfo['首次预算成本'] == 0, 'COST_BUDGET_OFFSET'] = 0
data['RAW_PROFIT_OFFSET'] = ((dataBasicInfo['当前预算毛利率'] - dataBasicInfo['首次毛利率']) / dataBasicInfo['首次毛利率']).fillna(0)
data.ix[dataBasicInfo['首次毛利率'] == 0, 'RAW_PROFIT_OFFSET'] = 0
data.ix[data['RAW_PROFIT_OFFSET'] < -100, 'RAW_PROFIT_OFFSET'] = -100
data.ix[data['RAW_PROFIT_OFFSET'] > 100, 'RAW_PROFIT_OFFSET'] = 100
incomeRateSelf = (dataBasicInfo['自作子项累计工作量'] / dataBasicInfo['自作子项累计预算收入']).apply(lambda x: x if ~np.isnan(x) and x != np.inf > 0 else 0) # 分母为0,则该值为0
coseRateSelf = (dataBasicInfo['累计材料成本'] / dataBasicInfo['自作材料预算成本之和']).apply(lambda x: x if ~np.isnan(x) and x != np.inf > 0 else 0) # 分母为0,则该值为0
data['MATERIAL_COST_PROGRESS'] = incomeRateSelf - coseRateSelf
data.ix[data['MATERIAL_COST_PROGRESS'] > 0, 'MATERIAL_COST_PROGRESS'] = 0 # 大于0的无风险
data['MATERIAL_COST_PROGRESS'] = - data['MATERIAL_COST_PROGRESS'] # 值越小风险越大, 取反
incomeRateOut = (dataBasicInfo['分包子项累计工作量'] / dataBasicInfo['分包子项累计预算收入']).apply(lambda x: x if ~np.isnan(x) and x != np.inf > 0 else 0) # 分母为0,则该值为0
coseRateOut = (dataBasicInfo['累计报账分包成本'] / dataBasicInfo['分包子项预算成本之和']).apply(lambda x: x if ~np.isnan(x) and x != np.inf > 0 else 0) # 分母为0,则该值为0
data['DISPTCH_COST_PROGRESS'] = incomeRateOut - coseRateOut
data.ix[data['DISPTCH_COST_PROGRESS'] > 0, 'DISPTCH_COST_PROGRESS'] = 0 # 大于0的无风险
data['DISPTCH_COST_PROGRESS'] = - data['DISPTCH_COST_PROGRESS'] # 值越小风险越大, 取反
data['CONTRCT_INCOME_TRANS_RATE'] = 0 # 合同收入转化率
data['CONTRCT_INCOME_TRANS_RATE'] = (1 - dataBasicInfo['项目累计列账收入']/dataBasicInfo['合同签订金额']).fillna(0)
data.ix[data['CONTRCT_INCOME_TRANS_RATE']<0, 'CONTRCT_INCOME_TRANS_RATE'] = 0
data.ix[data['CONTRCT_INCOME_TRANS_RATE']>1, 'CONTRCT_INCOME_TRANS_RATE'] = 1
data.ix[dataBasicInfo['关闭状态'] != '财务关闭', 'CONTRCT_INCOME_TRANS_RATE'] = 0 # 项目关闭后才考核该指标
data['CONTRCT_PROGRESS'] = 0 # 合同立项转入率
data['CONTRCT_PROGRESS'] = (1 - dataBasicInfo['合同签订金额'] / dataBasicInfo['当前预算收入']).fillna(0)
data.ix[data['CONTRCT_PROGRESS']<0, 'CONTRCT_PROGRESS'] = 0
data.ix[data['CONTRCT_PROGRESS']>1, 'CONTRCT_PROGRESS'] = 1
data.ix[dataBasicInfo['合同数量'] <= 1, 'CONTRCT_PROGRESS'] = 0 # 前提是一个项目对应多个合同
return data
if __name__ == '__main__':
db = cx_Oracle.connect('nfdw','nfdw','172.16.104.120:1521/ora120') # 创建连接
# basicInfo = get_data_from_oracle(db, 'A_GTF_SG_BASIC_INFO')
basicInfo = get_data_from_oracle(db, 'A_GTF_SG_BASIC_INFO')
# # 缺失值情况等,原数据
# colsName = []
# for col in basicInfo.columns:
# colsName.extend(basicInfo[col].describe().index)
# colsName = set(colsName)
# result = pd.DataFrame([], columns=colsName)
# for col in basicInfo.columns:
# print('****************', col, '****************')
# result = pd.concat((result, pd.DataFrame(basicInfo[col].describe()).T), axis=0)
# result.to_csv('C:/Users/Canon/Desktop/result1.csv', encoding='gbk')
# 缺失值情况等,计算得到的字段
colsName = []
for col in basicInfo.columns:
colsName.extend(basicInfo[col].describe().index)
colsName = set(colsName)
result = pd.DataFrame([], columns=colsName)
for col in basicInfo.columns:
print('****************', col, '****************')
result = pd.concat((result, pd.DataFrame(basicInfo[col].describe()).T), axis=0)
result.to_csv('C:/Users/Canon/Desktop/result1.csv', encoding='gbk')