#第五章:线性回归模型   数据下载地址:https://github.com/johnmyleswhite/ML_for_Hackers

library(ggplot2)
 ages <- read.csv("E:\\ML_for_Hackers-master\\05-Regression\\data\\longevity.csv")
 ggplot(ages,aes(x = AgeAtDeath,fill = factor(Smokes))) + geom_density()+facet_grid(Smokes~.)
                                                       
 #均方误差(MSE)的计算,使用73岁作为在不知道如何条件下的最好预测年龄
 guess <- 73
 with(ages,mean((AgeAtDeath - guess)^2))


 #使用其他年龄范围作为预测,查看均方误差
 guess.accuracy <- data.frame()
 for(guess in seq(63,83,by = 1))
 {
   prediction.error <- with(ages,
                            mean((AgeAtDeath - guess)^2))
   guess.accuracy <- rbind(guess.accuracy,
                           data.frame(Guess = guess,
                                      Error = prediction.error))
 }


 #得到各个年龄对应的误差值,在73岁时最小
 ggplot(guess.accuracy,aes(x = Guess,y = Error)) + geom_point() + geom_line()


 #使用虚拟变量的回归模型
 #R语言计算均方根误差(RMSE)


 constant.guess <- with(ages,mean(AgeAtDeath))
 with(ages,sqrt(mean((AgeAtDeath - constant.guess)^2)))  #不包含吸烟信息的预测误差(RMSE)


 smokers.guess <- with(subset(ages,Smokes == 1),
                       mean(AgeAtDeath))
 non.smokers.guess <- with(subset(ages,Smokes == 0),
                           mean(AgeAtDeath))
 ages <- transform(ages,
                   NewPrediction = ifelse(Smokes == 0,
                                          non.smokers.guess,
                                          smokers.guess))
 with(ages,sqrt(mean((AgeAtDeath - NewPrediction)^2)))  #包含吸烟信息的预测误差(RMSE)


 #体重相对身高的散点图,增加了回归线后体重相对于身高的散点图
 library("ggplot2")
 heights.weights <- read.csv("E:\\ML_for_Hackers-master\\05-Regression\\data\\01_heights_weights_genders.csv",
                             header = TRUE,
                             sep = ",")
 ggplot(heights.weights,aes(x = Height,y = Weight)) +
   geom_point() +
   geom_smooth(method = "lm")  #在调用geom_smoth()时指明要用lm方法即可


 #线性回归模型
 fitted.regression <- lm(Weight ~ Height,
                         data = heights.weights)
 #一旦运行了对lm函数的调用,就可以通过调用coef函数来得到回归直线的截距,,coef函数返回将输入和输出结果联系
 #在一起的线性模型的系数


 coef(fitted.regression)  #得到回归直线的截距
 intercept <- coef(fitted.regression)[1]
 slope <- coef(fitted.regression)[2]


 predict(fitted.regression)


 true.values <- with(heights.weights,Weight)
 errors <- true.values - predict(fitted.regression)  #预测误差(残差)


 #可使用residuals函数替换predict函数来直接获得残差
 residuals(fitted.regression)


 #为了发现使用线性回归时产生的明显错误,可以把残差和真实数据对赢你画在一幅图中,这里指定which=1
 #让R语言只画出了第一个回归诊断点图
 plot(fitted.regression,which = 1)


 #机器学习使用RMSE评估机器学习算法的效果
 x <- 1:10
 y <- x^2
 fitted.regression <- lm(y ~ x)
 errors <- residuals(fitted.regression)




 #预测网页流量
 top.10000.sites <- read.csv("E:\\ML_for_Hackers-master\\05-Regression\\data\\top_1000_sites.csv",
                             stringsAsFactors = FALSE)


 #用ggplot函数获得散点图,几乎所有的数据值都在x轴的附件挤成一束,而只有非常少的数字跳出那一堆数
 #主要原因是因为是因为使用了非标准分布数据,数值跨度太大
 ggplot(top.10000.sites,aes(x = PageViews,y = UniqueVisitors)) + geom_point()


 #观察PageViews本身的分布
 ggplot(top.10000.sites,aes(x = PageViews)) +geom_density()  #该密度图同样不可理解


 #尝试取log函数
 ggplot(top.10000.sites,aes(x = log( PageViews))) + geom_density()  #得到的密度图比较合理


 #使用Log变换后的PageViews和UniqueVisitors
 ggplot(top.10000.sites, aes(x = log(PageViews), y = log(UniqueVisitors))) +
   geom_point() +
   geom_smooth(method = 'lm', se = FALSE)  #加入了一跳回归线


 #调用Lm函数来找到定义这条直线斜率和截距的数值
 lm.fit <- lm(log(PageViews) ~ log(UniqueVisitors),data = top.10000.sites)


 #summary 告我我们的第一件事:对Lm所做的调用,第二件事:残差的分位数,等价于quantile(residuls(lm.fit))
 summary(lm.fit)


 lm.fit <- lm(log(PageViews) ~ HasAdvertising + log(UniqueVisitors) + InEnglish,
                               data = top.10000.sites)
 summary(lm.fit)


 #在实践中,当输入容易获得时,值得将所有的输入都包含进一个预测模型,但是当HasAdvert是难以通过程序获得
 #时,那么可以将其去掉
 lm.fit <- lm(log(PageViewisings) ~ HasAdvert,data = top.10000.sites)
 summary(lm.fit)$r.squared  #解释了1%的方差  


 lm.fit <- lm(log(PageViews) ~ log(UniqueVisitors),
              data = top.10000.sites)
 summary(lm.fit)$r.squared  #解释了46%的方差


 lm.fit <- lm(log(PageViews) ~  InEnglish,
              data = top.10000.sites)
 summary(lm.fit)$r.squared  #解释了3%的方差


 #定义相关性
 x <- 1:10
 y <- x^2
 ggplot(data.frame(X = x, Y = y), aes(x = X, y = Y)) +
   geom_point() +
   geom_smooth(method = 'lm', se = FALSE)
 #用cor函数来估计点和线的线性关系
 cor(x,y)