Stata备忘录

1. 画图

(1)时间趋势图
* Dual-axis time trend: per is drawn against the left y axis and tjj against
* the right y axis, both over year, with a vertical reference line at 2011.
label variable year "年份"
label variable per "制造业增加值比重[左轴]"
label variable tjj "工业增加值比重[右轴]"
#delimit ;
twoway
    (connected per year, yaxis(1) color(black))
    (connected tjj year, yaxis(2) color(black) lpattern(dash))
    ,
    graphregion(color(white))
    xlabel(2003(2)2019)
    ytitle("世界银行制造业增加值比重(%)", axis(1) height(5))
    ytitle("国家统计局工业增加值比重(%)", axis(2) height(5))
    note(数据来源:World Bank Open Data、国家统计局)
    xline(2011) ;
#delimit cr

stata bysort回归后输出结果_数据


等价命令

* Two connected lines over season: value1819 (dashed, circle markers) and
* value2020 (solid, square markers), with quarter names as x-axis labels.
#delimit ;
twoway
    (connected value1819 season,
        lcolor(black) lpattern(dash) msymbol(O) mlcolor(gs5) mfcolor(gs12))
    (connected value2020 season,
        lcolor(black) lpattern(solid) msymbol(S) mlcolor(gs5) mfcolor(gs12))
    ,
    graphregion(color(white))
    legend(label(1 "18-19年平均") label(2 "2020年"))
    xlabel(1 "第一季度" 2 "第二季度" 3 "第三季度" 4 "第四季度", labsize(small)) ;
#delimit cr
label variable year "年份"

* Bar + line chart on two y axes, then saved to disk.
*   plot 1: bars of mR1 (revenue) on the right axis, axis(2)
*   plot 2: connected line of percent_R (share) on the left axis, axis(1)
* Fixes relative to the earlier version:
*   - colour names were misspelled ("balck", "blace"); both are now "black"
*   - legend labels were swapped: key 1 is the revenue bar and key 2 is the
*     share line, because legend keys follow plot order
tw bar mR1 year,yaxis(2) bc(black) sort barwidth(0.9) fintensity(inten0) ///
ylabel(0(2000)6000, axis(2)) ///
xlabel(2014(1)2021)|| ///
connect percent_R year,yaxis(1) lc(black) lp(dash) mc(black) ///
ylabel(0.5 "50%" 0.6 "60%" 0.7 "70%" 0.8 "80%" ,axis(1))  ||, ///
graphregion(color(white) ) ///
bgcolor(white) ///
title("中国数字内容企业(游戏)收入金额及占全球市场比重", c(black) size(*0.8)) ///
ytitle("占比(%)",axis(1) height(7))  ///
ytitle("收入额(百万美元)",axis(2) height(5))  ///
legend(label(1 "中国数字内容企业(游戏)收入金额") label(2 "中国数字内容企业(游戏)收入占全球市场比重") ) ///
legend(size(small) col(1)) ///
note("数据源自:app annie")

* Save the graph currently named "Graph" as a .gph file under $path\output.
graph save "Graph" "$path\output\playdata_1_percent_and_value_of_Chinese_Apps_Export.gph",replace

stata bysort回归后输出结果_人工智能_02

* Year effects of lnV estimated separately for three BEC groups and drawn
* in a single coefplot.
* NOTE(review): the second -use- replaces the data loaded by the first, so
* only the _cregime10 file is actually analysed.
use hs_adj_year_PQV_2000_2015.dta, clear
use hs_adj_year_PQV_2000_2015_cregime10, clear

* Attach the equipment flag; BEC 1 products that are not equipment are recoded to 4.
merge m:1 hs_adj using equipment
replace BEC = 4 if BEC == 1 & equipment != 1

destring hs_adj, replace

* One regression per group; year 2006 is excluded from every sample.
reghdfe lnV i.year if year != 2006 & BEC == 2, absorb(hs)
estimates store result_accessories

reghdfe lnV i.year if year != 2006 & BEC == 4, absorb(hs)
estimates store result_equipment

reghdfe lnV i.year if year != 2006 & BEC == 0, absorb(hs)
estimates store result_noncapital

* Overlay the three sets of year coefficients; the offsets keep markers from overlapping.
coefplot ///
    (result_accessories, connect(l) label("accessories") lpattern(dash) lcolor(black) mcolor(black) msymbol(smcircle_hollow) offset(-0.07)) ///
    (result_equipment, connect(l) label("equipment") lpattern(solid) lcolor(black) mcolor(black) msymbol(smcircle_hollow)) ///
    (result_noncapital, connect(l) label("noncapital") lpattern(dot) lcolor(black) mcolor(black) msymbol(smcircle_hollow) offset(0.07)) ///
    , vertical ///
    drop(_cons) byopts(xrescale) ///
    xlabel(1 "2001" 3 "2003" 5 "2005" 6 "2007" 8 "2009" 10 "2011" 12 "2013" 14 "2015") ///
    graphregion(color(white)) ///
    legend(size(small) col(3))

stata bysort回归后输出结果_数据_03

字体大小 option

字体大小option

description

zero

no size whatsoever, vanishingly small

minuscule

smallest

quarter_tiny

third_tiny

half_tiny

tiny

vsmall

small

medsmall

medium

medlarge

large

vlarge

huge

vhuge

largest

tenth

one-tenth the size of the graph

quarter

one-fourth the size of the graph

third

one-third the size of the graph

half

one-half the size of the graph

full

text the size of the graph

size

any size you want

节点样式 eg: msymbol(O) mlcolor(gs5) mfcolor(gs12)

symbolstyle

Synonym(if any)

Description

circle

O

solid

diamond

D

solid

triangle

T

solid

square

S

solid

plus

+

X

X

arrowf

A

filled arrow head

arrow

a

pipe

|

V

V

smcircle

o

solid

smdiamond

d

solid

smsquare

s

solid

smtriangle

t

solid

smplus

smx

x

smv

v

circle_hollow

Oh

hollow

diamond_hollow

Dh

hollow

triangle_hollow

Th

hollow

square_hollow

Sh

hollow

smcircle_hollow

oh

hollow

smdiamond_hollow

dh

hollow

smtriangle_hollow

th

hollow

smsquare_hollow

sh

hollow

point

p

a small dot

none

i

a symbol that is invisible

线样式

linepatternstyle

Description

solid

solid line

dash

dashed line

dot

dotted line

dash_dot

shortdash

shortdash_dot

longdash

longdash_dot

blank

invisible line

formula

e.g., "-." or "--.." etc.

A formula is composed of any combination of

l

solid line

_

(underscore) a long dash

-

(hyphen) a medium dash

.

short dash (almost a dot)

#

small amount of blank space

颜色

black

edkblue

gs12

lime

orange

blue

eggshell

gs13

ltblue

orange_red

bluishgray

eltblue

gs14

ltbluishgray

pink

bluishgray8

eltgreen

gs15

ltbluishgray8

purple

brown

emerald

gs16

ltkhaki

red

chocolate

emidblue

gs2

magenta

sand

cranberry

erose

gs3

maroon

sandb

cyan

forest_green

gs4

midblue

sienna

dimgray

gold

gs5

midgreen

stone

dkgreen

gray

gs6

mint

sunflowerlime

dknavy

green

gs7

navy

teal

dkorange

gs0

gs8

navy8

white

ebblue

gs1

gs9

none

yellow

ebg

gs10

khaki

olive

edkbg

gs11

lavender

olive_teal

(2)柱状图
* Bar chart of cn_wzje_80 and cn_wzje_81 by sec, restricted to wzlx == 0.
* The negative bargap makes the two bars within each group overlap.
graph bar cn_wzje_80 cn_wzje_81 if wzlx == 0, over(sec) bargap(-30) ///
    ytitle("吸引外资金额") ///
    legend(label(1 "08年前") label(2 "08年后")) ///
    title("各行业吸引外资2008年前后对比") ///
    subtitle("中国中西部服务业") ///
    note("中国中西部服务业")

stata bysort回归后输出结果_人工智能_04

(3)散点图
* Labelled scatter of delta_v2_v3 against delta_v1_v2, keeping only points
* where both variables lie in [-0.3, 2.3], with a 45-degree line (y = x) and
* dashed zero reference lines on both axes.
twoway ///
    (scatter delta_v2_v3 delta_v1_v2 ///
        if inrange(delta_v1_v2, -0.3, 2.3) & inrange(delta_v2_v3, -0.3, 2.3), ///
        mlabel(hy4) mlcolor(black) mlabcolor(black) msymbol(x) mlabsize(tiny)) ///
    (function y = x, range(-0.3 2.3)), ///
    xlabel(-0.3(0.5)2.3) ylabel(-0.3(0.5)2.3) ///
    graphregion(color(white)) ///
    xline(0, lpattern(dash) lcolor(gs10)) ///
    yline(0, lpattern(dash) lcolor(gs10)) ///
    legend(ring(0) pos(5) order(2 "45°线")) ///
    ytitle("11-15时段的增速") ///
    xtitle("07-11时段的增速")

stata bysort回归后输出结果_数据_05

* Scatter of wzje_CE0308 against wzje_CE1419, labelled by sec, with a y = x line.
* NOTE(review): mlabvposition() expects a variable of clock positions; sec is
* also the label variable here - confirm that this is intended.
twoway ///
    (scatter wzje_CE0308 wzje_CE1419, mlabel(sec) mlabvposition(sec)) ///
    (function y = x, range(0 0.11)), ///
    title("中国东部地区03-08对比中国东部地区14-19[金额]") ///
    ytitle("中国东部03-08") ///
    xtitle("中国东部14-19") ///
    legend(ring(0) pos(5) order(2 "45°线")) ///
    graphregion(color(white))

stata bysort回归后输出结果_大数据_06

* Scatter of c_AS against c_WD, labelled by cic03, with a linear fit overlaid.
* Only the fitted line (plot 2) appears in the legend.
twoway ///
    (scatter c_AS c_WD, mlabel(cic03)) ///
    (lfit c_AS c_WD), ///
    title("东盟增速放缓 vs 世界增速放缓") ///
    ytitle("东盟") ///
    xtitle("世界") ///
    legend(ring(0) pos(5) order(2 "拟合")) ///
    graphregion(color(white))
(4)bgshade
* bgshade is a user-written command: it draws the twoway graph given in
* twoway() over ks, using uu9 as the shaders() variable.
* The inner graph connects lamda22 over ks separately for treat==1 and
* treat==0, restricted to 6 <= ks <= 11.
* NOTE(review): presumably uu9 is a 0/1 indicator of the periods to shade - confirm.
bgshade ks, shaders(uu9)  ///
   twoway(connect lamda22 ks if treat==1&ks>=6&ks<=11 ||             ///  
          connect lamda22 ks if treat==0&ks>=6&ks<=11 , xlab(6(1)11) ///
   title("新冠疫情冲击下企业平均收入变化趋势"))

stata bysort回归后输出结果_数据_07

(5)coefplot
* Plot the coefficients of the most recent estimation with 90% confidence
* intervals, drawn vertically, with a solid zero line and custom period labels.
coefplot, levels(90) vertical ///
    lcolor(black) mcolor(black) msymbol(circle_hollow) ///
    ytitle(估计系数, size(small)) ///
    xtitle(事件发生时间, size(small)) ///
    title("(B)企业缴税的平行趋势检验") ///
    ylabel(, labsize(small) angle(horizontal) nogrid) ///
    xlabel(0 "." 1 "2019s2" 2 "2019s3" 3 "2019s4" 4 "2020s1" 5 "2020s2" 6 "2020s3" 7 "2020s4") ///
    yline(0, lwidth(vthin) lpattern(solid) lcolor(black))

stata bysort回归后输出结果_stata bysort回归后输出结果_08

* Year effects of lnQ and lnV for elec == 1 and elec == 0, absorbing citycode
* fixed effects with robust standard errors. Estimates are stored as
* elec_<Q|V>_<1|0>, in the order Q_1, V_1, Q_0, V_0.
foreach grp in 1 0 {
    foreach dep in Q V {
        reghdfe ln`dep' i.Year if elec == `grp', absorb(i.citycode) vce(robust)
        estimates store elec_`dep'_`grp'
    }
}

* Compare the two lnQ models; addplot() joins the point estimates with a dashed line.
coefplot ///
    (elec_Q_1, label("半导体电子元件相关企业进口数量") offset(0.05) pstyle(p3)) ///
    (elec_Q_0, label("非半导体电子元件相关企业进口数量") offset(-0.05) pstyle(p4)), ///
    vertical drop(_cons) xline(0) ///
    graphregion(color(white)) ///
    yline(0) ///
    addplot(line @b @at, lpattern(dash) lwidth(*0.5)) ///
    legend(label(1 "半导体电子元件相关企业进口数量") label(2 "非半导体电子元件相关企业进口数量"))

stata bysort回归后输出结果_数据_09

(6)画系数和置信区间
* Plot point estimates (coef) against week and overlay capped spikes running
* from ci_lower to ci_upper; the result is stored as the named graph Coef_all_I.
* The legend key of the second plot (the spikes) is labelled "99% CI".
* NOTE(review): mcolor()/msymbol() are marker options passed to rcap here -
* confirm that rcap accepts them in your Stata version.
twoway (scatter coef week) /// 
(rcap ci_lower ci_upper week, /// 
 lcolor(black) /// 
 mcolor(black) /// 
 lwidth(vthin) /// 
 lpattern(dash) ///
 msymbol(circle_hollow) ///
 legend(label(2 "99% CI"))) , ///
yline(0) ///
xtitle("")  ///
graphregion(fcolor(white)) ///
title("第X周的系数", size(medium)) /// 
name("Coef_all_I", replace)
(7)画直方图

一般使用kdensity

* Frequency histogram of year for 1400-2010 using 200 bins, with thin vertical
* lines at 1950 and 1980 and a text annotation to the left (west) of each.
histogram year if inrange(year, 1400, 2010), frequency bin(200) ///
    ylabel(0(500)2500) xtitle("Year") ///
    xline(1950 1980, lwidth(thin)) ///
    text(1500 1950 "Year=1950", placement(w)) ///
    text(2000 1980 "Year=1980", placement(w))

stata bysort回归后输出结果_ci_10

(8)画桑基图
* Build one row per (from, to) pair with total Revenue (tR) and total
* Downloads (tD), plus the x positions used by the Sankey plot.
cd $path\appdata
use Data_games.dta, clear

* Keep only the observations matched to the company -> city lookup.
merge m:1 ParentCompanyName using "$path\data\company_city"
keep if _merge == 3
drop _merge

* Flow endpoints: source is city_code, destination is the encoded iso3_j.
generate from = city_code
encode iso3_j, generate(to)

* Totals per flow, then reduce to one row per flow.
bysort from to: egen tR = total(Revenue)
bysort from to: egen tD = total(Downloads)
duplicates drop from to, force

* x positions of the source (1) and destination (2) columns.
generate x0 = 1
generate x1 = 2

tostring city_code, generate(city2)
drop if dest == "CHN"

* Sankey diagram of flows from -from- (at x0) to -to- (at x1), with flow
* width given by tR; the graph is then saved to $path\output.
* sankey_plot is a user-written command.
* NOTE(review): label0(city) refers to a variable named city, while the
* preparation step creates city2 - confirm which one is intended.
sankey_plot x0 from x1 to, ///
width0(tR) extra xlabel(1 "Source" 2 "Destination", nogrid labsize(small)) ///
colorpalette(economist, opacity(30)) ///
label0(city) label1(iso3_j) ///
labsize(*0.6) labcolor(black) ///
graphregion(color(white))  gap(0.1) ///
title("地级市层面Apps出海流向(按收入额)",color(black) size(*0.8))

graph save "Graph" "$path\output\sankey_R_0228.gph",replace


* Same diagram with flow width given by tD, saved under a different file name.
sankey_plot x0 from x1 to, ///
width0(tD) extra xlabel(1 "Source" 2 "Destination", nogrid labsize(small)) ///
colorpalette(economist, opacity(30)) ///
label0(city) label1(iso3_j) ///
labsize(*0.6) labcolor(black) ///
graphregion(color(white))  gap(0.1) ///
title("地级市层面Apps出海流向(按下载量)",color(black) size(*0.8))

graph save "Graph" "$path\output\sankey_D_0228.gph",replace

stata bysort回归后输出结果_数据_11

(9)气泡图
* Bubble chart: mv against T_gap_05_00 with marker size driven by the frequency
* weight N, one semi-transparent hollow-circle series per BEC group (0, 4, 2).
* Observations with T_gap_05_00 == 0 or N == 0 are excluded.
twoway ///
    (scatter mv T_gap_05_00 [fweight = N] if BEC == 0 & T_gap_05_00 != 0 & N != 0, msymbol(Oh) mcolor(ebblue%40)) ///
    (scatter mv T_gap_05_00 [fweight = N] if BEC == 4 & T_gap_05_00 != 0 & N != 0, msymbol(Oh) mcolor(orange_red%40)) ///
    (scatter mv T_gap_05_00 [fweight = N] if BEC == 2 & T_gap_05_00 != 0 & N != 0, msymbol(Oh) mcolor(green%20)) ///
    , legend(label(1 "非资本品") label(2 "equipment") label(3 "accessories"))

stata bysort回归后输出结果_数据_12

2. 处理数据

(1)拓展expand数据

例如:当前数据中 有9523、9524、9525、9526、9527、9528共计6个样本,现在想把这6个样本根据freq进行扩充,变为9523、9524_1、9524_2、9525_1、9525_2、9526_1、9526_2、9527_1、9527_2、9528_1、9528_2这11个样本

freq

count

value

1

9523

4845.1143

2

9524

969.66498

2

9525

129.53349

2

9526

71284.508

2

9527

1038.127

2

9528

445877.09

count是id的唯一识别码;expandcl命令会把每个count对应的样本复制为freq份,并生成一个新的识别码freq_count,用来区分复制出来的各份样本

* count identifies each (id, hs02_6) combination.
egen count = group(id hs02_6)
* Replicate every count cluster freq times; freq_count receives the new
* cluster identifier and is dropped again straight away.
expandcl freq, cluster(count) generate(freq_count)
drop freq_count
(2)时间数据
* Build a daily date R from its month/day/year components and derive calendar parts.
gen R = mdy(month_r, day_r, year_r)
gen week_r = week(R)               // week of the year
* day_r already exists (it is an input to mdy() above), so "gen day_r = day(R)"
* would stop with "variable day_r already defined". The day of the month is
* therefore stored under a new name.
gen dom_r = day(R)                 // day of the month
gen dow_r = dow(R)                 // day of the week (0 = Sunday, ..., 6 = Saturday)
gen doy_r = doy(R)                 // day of the year
gen yw_r = yw(year_r, week_r)      // weekly date built from year and week
* NOTE(review): yw is assumed to be an existing weekly-date variable - confirm.
gen ed = yw - yw_r
// yw(), ym(), yq() and yh() build weekly, monthly, quarterly and half-yearly dates
* Number of days between two dates stored as "YMD" strings.
gen period_kb = date(date_u, "YMD") - date(date_kb, "YMD")
(3)常见函数
int(x)                         // truncate: drop the fractional part, whatever it is
round(x)                       // round to the nearest integer
round(x, .01)                  // round to two decimal places (nearest multiple of 0.01)
gen y = sum(x)                 // running (cumulative) sum down the column
egen y = total(x)              // column total; total() is the current name of egen's sum()
egen y = rowtotal(x y z)       // row sum x + y + z; rowtotal() is the current name of rsum()
egen y = rowmean(x y z)        // row mean of x, y and z
egen y = rowsd(x y z)          // row standard deviation of x, y and z (not the variance)
egen y = rowmin(x y z)         // row minimum of x, y and z (was misspelled "rowmim")
egen y = rowmax(x y z)         // row maximum of x, y and z
egen y = mean(x)               // column mean
egen y = median(x)             // column median
egen y = std(x)                // standardized x (mean 0, sd 1); not the coefficient of variation

* Within each group of x, sort by y and set z to the first value of y.
bysort x (y): gen z = y[1]
(4)缩尾处理
* Manual winsorizing at the 1st and 99th percentiles: for every variable from
* DexpoAS4 through DlnexpoWD2, create <var>_w with both tails capped.
foreach v of varlist DexpoAS4-DlnexpoWD2 {
    generate `v'_w = `v'
    quietly summarize `v', detail
    * r(p1) and r(p99) from summarize stay available across both replaces.
    replace `v'_w = r(p99) if `v' > r(p99) & !missing(`v')
    replace `v'_w = r(p1) if `v' < r(p1)
}

* winsor2 is a user-written command; it is applied to wage in place at the 1/99 cut points.
* NOTE(review): with -trim- the tails are presumably trimmed (set to missing)
* rather than winsorized - confirm that this is what is wanted.
winsor2 wage, cuts(1 99) trim replace

对一个变量执行 summarize 之后,可以返回的结果有

r(N)           //number of observations
r(mean)        //mean
r(skewness)    //skewness (detail only)
r(min)         //minimum
r(max)         //maximum
r(sum_w)       //sum of the weights
r(p1)          //1st percentile (detail only)
r(p5)          //5th percentile (detail only)
r(p10)         //10th percentile (detail only)
r(p25)         //25th percentile (detail only)
r(p50)         //50th percentile (detail only)
r(p75)         //75th percentile (detail only)
r(p90)         //90th percentile (detail only)
r(p95)         //95th percentile (detail only)
r(p99)         //99th percentile (detail only)
r(Var)         //variance
r(kurtosis)    //kurtosis (detail only)
r(sum)         //sum of variable
r(sd)          //standard deviation
(5)创建文件夹

在project路径下生成一个workspace文件夹
再生成子文件夹储存数据(data)、控制变量(controlvars )、临时数据(tempdata)、回归结果(outreg)

* efolder is a user-written command for creating a project folder structure.
* The second call was missing the ")" that closes cd(), which left sub()
* nested inside an unterminated cd( option.
* NOTE(review): confirm against -help efolder- whether the new folder name
* should be given before the comma instead of inside cd().
efolder, cd(D:\stata15\project\workspace)
efolder, cd(D:\stata15\project\workspace) sub(data controlvars tempdata outreg)
(6)分组处理数据(bysort的替代方案)
* Mean of drqianfen (death rate) for each highzupu50 (genealogy) x year cell.
* collapse replaces the data in memory with one row per group.
collapse (mean) drqianfen, by(highzupu50 year)
(7)定义无缺失的样本
* rsample = 1 when none of the listed variables is missing, 0 otherwise.
generate rsample = !missing(avggrain_fyr, nograin_fyr, urban_fyr, dis_bj_fyr, dis_pc_fyr, migrants_fyr, rice_fyr, minor_fyr, edu_fyr)
(8)定义Dummy的新替代式(时间range)
* pre = 1 if 1825 <= yob <= 1899 and 0 otherwise; mid and post are built the same way.
* The lower bound of mid is 1900 so that the three cohorts do not overlap:
* with 1899, yob == 1899 was flagged as both pre and mid.
gen pre = inrange(yob, 1825, 1899)
gen mid = inrange(yob, 1900, 1919)
gen post = inrange(yob, 1920, 1960)
(9)快速替换
* Map the listed treatyear values to 1-5 in place; values not listed are left unchanged.
recode treatyear (1969 = 1) (1979 = 2) (1989 = 3) (1999 = 4) (2009 = 5)

3. 处理字符

(1)替换字符
* Remove every space from the string variable; the final "." means "replace all occurrences".
replace 候选人姓名=subinstr( 候选人姓名, " ", "",. )
(2)捕捉字符中的某些特征
* Keep observations whose city contains "山东" (* matches any run of characters).
keep if strmatch(city, "*山东*")
* temp = 1 when reporteriso3 starts with "A"; it is missing otherwise.
gen temp = 1 if strmatch(reporteriso3, "A*")
(3)提取字符,检索特定字符
// Take 4 characters of enddate starting at position 1 and store them in year (a string)
gen year = substr(enddate,1,4)  

// strpos(s1, s2) returns the position of s2 within s1, or 0 if s2 is not found;
// y is therefore 1 when s1 contains s2 and 0 otherwise
gen y = strpos(s1, s2) != 0

4. 输出结果

(1)常规输出
* Append the current estimates to the Word table "r2", with two extra text rows (CountryFE, YearFE).
outreg2 using "E:\mfg\outreg\r2", word append addtext(CountryFE, YES,YearFE, YES)
(2)iv回归输出第一阶段
* Fixed-effects IV regression of p_a_w, with DexpoCN4_w instrumented by dexpo44;
* -first- also reports the first-stage results.
eststo: xtivreg p_a_w (DexpoCN4_w=dexpo44) /// 
 i.t c.expr0#t c.Lshare0#t /// 
 c.lnGDP0#t c.lngdp0#t, fe first vce(cluster c)  
 
 * Re-estimate the first stage by hand on the IV estimation sample (e(sample))
 * so that it can be exported as a table.
 eststo: xtreg DexpoCN4_w dexpo44 /// 
 i.t c.expr0#t c.Lshare0#t /// 
 c.lnGDP0#t c.lngdp0#t if e(sample)==1 ,fe  
 * Write the table to $path\outreg, keeping only the dexpo44 coefficient.
 cd $path\outreg 
 outreg2 using "table3", word replace addtext(CityFE, YES,YearFE, YES) keep(dexpo44)
(3)变量描述性统计
* Report N, mean, standard deviation, minimum and maximum for inv and the
* other listed variables, with the statistics laid out as columns.
tabstat inv loginv log_levies ///
    logpopl logincome logasset hhsize landpc logmigration logtax logtransfer share_admin ///
    postcont postopen secret_ballot proxy_voting moving_ballot, ///
    statistics(N mean sd min max) columns(statistics)

5.矩阵保存结果

* Collect the season 2-4 coefficients (season 1 is the base level) from
* year-specific regressions into a 3x3 matrix: rows are seasons 2-4, columns
* are the years 2018-2020. The matrix is then written to variables with svmat.
matrix T1 = J(3, 3, .)

local col = 0
forvalues yr = 2018/2020 {
	local ++col
	reghdfe temp ib1.season if year == `yr' & treat == 1, noabsorb
	forvalues s = 2/4 {
		local r = `s' - 1
		matrix T1[`r', `col'] = _b[`s'.season]
	}
}

svmat T1

6.导入数据(全字符串)

* For each year 2000-2003: read the yearly CSV from the origindata folder with
* every column imported as a string, then save it as data<year>.dta in the
* parent folder.
forvalues yr = 2000/2003 {
	cd E:\Data\EPS工企海关匹配库\origindata
	import delimited using "工企+海关(`yr').csv", stringcols(_all) clear
	cd E:\Data\EPS工企海关匹配库
	save data`yr'.dta, replace
}

7.循环

clear all
set obs 1000

**#** Use a forvalues loop to fill a single variable

* id is the person number: 50 people with 20 observations (years) each.
generate id = .

forvalues person = 1/50 {
	local last  = `person' * 20		// last row of this person: 20, 40, 60, ...
	local first = `last' - 19		// first row of this person: 1, 21, 41, ...
	replace id = `person' in `first'/`last'	// rows 1-20 -> person 1, rows 21-40 -> person 2, ...
}

* Within each person, build a time index running from 2001 to 2020.
bysort id: generate T = _n + 2000



**#** Use a forvalues loop to process several variables

* For i = 1..5: create value`i' as 10 * e + i, where e is a fresh rnormal()
* draw that is generated and dropped again inside every iteration.
forvalues i = 1/5 {
	gen value`i' = .
	cap gen e = rnormal()
	replace value`i' = e * 10 + `i'
	cap drop e
}

// The loop above does the same as writing each step out by hand, as done below for value6-value10
gen value6 = .
cap gen e = rnormal()
replace value6 = e * 10 + 6
cap drop e

gen value7 = .
cap gen e = rnormal()
replace value7 = e * 10 + 7
cap drop e

gen value8 = .
cap gen e = rnormal()
replace value8 = e * 10 + 8
cap drop e

gen value9 = .
cap gen e = rnormal()
replace value9 = e * 10 + 9
cap drop e

gen value10 = .
cap gen e = rnormal()
replace value10 = e * 10 + 10
cap drop e


**#** Use a while loop
// For every year T from 2010 to 2019 (inclusive), multiply value1-value10 by 0.1
local i = 2010 
while `i' < 2020 {
	forvalues j = 1/10{
		replace value`j' = value`j' * 0.1 if T == `i'
	}
	local i = `i' + 1
}

**#** Use foreach to process a list of variables
* For each variable: summarize it, rescale it to (x - min) / (max - min)
* using r(min) and r(max) from summarize, then draw its kernel density.
foreach v in value1 value2 value3 value4 value5  {
	su `v' ,d
	replace `v' = (`v' - r(min)) / (r(max) - r(min))
	kdensity `v'
}

// The loop above does the same as writing each step out by hand, as done below for value6-value10
su value6,d 
replace value6 = (value6 - r(min)) / (r(max) - r(min))
kdensity value6

su value7,d 
replace value7 = (value7 - r(min)) / (r(max) - r(min))
kdensity value7

su value8,d 
replace value8 = (value8 - r(min)) / (r(max) - r(min))
kdensity value8

su value9,d 
replace value9 = (value9 - r(min)) / (r(max) - r(min))
kdensity value9

su value10,d 
replace value10 = (value10 - r(min)) / (r(max) - r(min))
kdensity value10

8.将第一行作为 label / varname

* Use the first row as variable labels (labone is a user-written command)
labone,nrow(1) 

* Use the first row as variable names, without keeping that row (nrow is a user-written command)
nrow

* Use the first row as variable names and keep that row
nrow,keep