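# Machine Learning: class exercises 4 and 5
# (original title: מערכות לומדות: תרגילי כיתה 4 ו-5)
# Page header captured with the slides: "Related content", "Similar to
# Machine Learning: class exercises 4 and 5 (16)", "More from Igor Kleiner (20)".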
# ---- Part 1: PISA 2009 reading scores (linear regression) ----

# Read the train/test splits. stringsAsFactors = TRUE is needed on R >= 4.0,
# where read.csv() keeps text columns as character: relevel() below only
# accepts factors, and plot() on a character vector fails.
pisaTrain <- read.csv("pisa2009train.csv", stringsAsFactors = TRUE)
pisaTest <- read.csv("pisa2009test.csv", stringsAsFactors = TRUE)
str(pisaTrain)
summary(pisaTrain)
pairs(pisaTrain[, 18:24])

# Average readingScore per value of `male`
tapply(pisaTrain$readingScore, pisaTrain$male, mean)

# Missing values: inspect the NA counts, then drop every incomplete row
summary(pisaTrain)
pisaTrain <- na.omit(pisaTrain)
pisaTest <- na.omit(pisaTest)
summary(pisaTrain)
summary(pisaTest)

plot(pisaTest$raceeth)
str(pisaTest$raceeth)

# Make "White" the reference level so the model's raceeth coefficients are
# contrasts against that group.
pisaTrain$raceeth <- relevel(pisaTrain$raceeth, "White")
pisaTest$raceeth <- relevel(pisaTest$raceeth, "White")

# Regress readingScore on all remaining columns
lmScore <- lm(readingScore ~ ., data = pisaTrain)
summary(lmScore)

# In-sample error
SSE <- sum(lmScore$residuals^2)
RMSE <- sqrt(mean(lmScore$residuals^2))

# Out-of-sample error (note: SSE is overwritten with the test-set value)
predTest <- predict(lmScore, newdata = pisaTest)
summary(predTest)
rmse <- sqrt(mean((predTest - pisaTest$readingScore)^2))
SSE <- sum((predTest - pisaTest$readingScore)^2)

# Test-set R^2 against the baseline that always predicts the training mean
baseline <- mean(pisaTrain$readingScore)
SST <- sum((baseline - pisaTest$readingScore)^2)
r2 <- 1 - SSE / SST

# Regression diagnostics and the distribution of the response
plot(lmScore)
hist(pisaTrain$readingScore)
# ---- Part 2: Elantra monthly sales (multicollinearity, seasonality) ----
Elantra <- read.csv("elantra.csv")
# fix() opens an interactive data editor; skip it when run as a batch script.
if (interactive()) fix(Elantra)
pairs(Elantra)
cor(Elantra)

# Variance inflation factors for the all-predictor model
library(car)
fit <- lm(ElantraSales ~ ., data = Elantra)
vif(fit)

# Global test of model assumptions
# Install gvlma only when it is missing instead of reinstalling on every run.
if (!requireNamespace("gvlma", quietly = TRUE)) install.packages("gvlma")
library(gvlma)
gvmodel <- gvlma(fit)
summary(gvmodel)

# Time-based split: fit on years up to 2012, evaluate on later years
ElantraTrain <- subset(Elantra, Year <= 2012)
ElantraTest <- subset(Elantra, Year > 2012)

# Model 1: economic indicators only
ElantraLM <- lm(ElantraSales ~ Unemployment + Queries + CPI_energy + CPI_all,
                data = ElantraTrain)
summary(ElantraLM)

# Model 2: Month added as a numeric predictor (a single linear trend in month)
ElantraLM <- lm(ElantraSales ~ Unemployment + Queries + CPI_energy + CPI_all + Month,
                data = ElantraTrain)
summary(ElantraLM)

# Model 3: Month as a factor, so each month gets its own coefficient
ElantraTrain$MonthFactor <- as.factor(ElantraTrain$Month)
ElantraTest$MonthFactor <- as.factor(ElantraTest$Month)
ElantraLM <- lm(ElantraSales ~ Unemployment + Queries + CPI_energy + CPI_all + MonthFactor,
                data = ElantraTrain)
summary(ElantraLM)

# Pairwise correlations among the predictors
cor(ElantraTrain[c("Unemployment", "Month", "Queries", "CPI_energy", "CPI_all")])

# Test-set error of the final (MonthFactor) model, measured against the
# baseline that always predicts the training-set mean
PredictTest <- predict(ElantraLM, newdata = ElantraTest)
SSE <- sum((PredictTest - ElantraTest$ElantraSales)^2)
SST <- sum((mean(ElantraTrain$ElantraSales) - ElantraTest$ElantraSales)^2)
# Test-set R^2 (SSE and SST were computed but never combined)
1 - SSE / SST
# ---- Part 3: interaction terms in lm() formulas ----
# Boston ships with the MASS package, which no library() call visible in this
# script attaches; load the data set explicitly so `data = Boston` resolves.
data("Boston", package = "MASS")

# `lstat * age` expands to lstat + age + lstat:age (main effects + interaction)
mod1 <- lm(medv ~ lstat * age, data = Boston)
summary(mod1)

# Spelling the main effects out as well yields the same model as mod1
mod2 <- lm(medv ~ lstat + age + lstat * age, data = Boston)
summary(mod2)

# `lstat:age` alone fits only the interaction term, without main effects
mod3 <- lm(medv ~ lstat:age, data = Boston)
summary(mod3)
# ---- Part 4: nonlinear relationship (Auto data) ----
# The package is named ISLR; the original install call misspelled it "ILSR".
if (!requireNamespace("ISLR", quietly = TRUE)) install.packages("ISLR")
library(ISLR)
if (interactive()) help("Auto")
pairs(Auto)

# Pass `data = Auto` instead of attach(Auto): nothing is left on the search
# path to mask or be masked by other objects.
plot(mpg ~ horsepower, data = Auto)

# Linear fit and its four diagnostic plots on a 2 x 2 grid
summary(lm(mpg ~ horsepower, data = Auto))
par(mfrow = c(2, 2))
plot(lm(mpg ~ horsepower, data = Auto))

# Quadratic fit
summary(lm(mpg ~ horsepower + I(horsepower^2), data = Auto))
plot(lm(mpg ~ horsepower + I(horsepower^2), data = Auto))

# Degree-7 polynomial fit
summary(lm(mpg ~ poly(horsepower, 7), data = Auto))
plot(lm(mpg ~ poly(horsepower, 7), data = Auto))

# ---- Cross validation ----
if (!requireNamespace("boot", quietly = TRUE)) install.packages("boot")
library(boot)
set.seed(2017)

# 10-fold CV error for polynomial degrees 1..10; delta[1] is cv.glm's raw
# cross-validation estimate of prediction error.
cv.error.10 <- numeric(10)
for (i in seq_along(cv.error.10)) {
  glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  cv.error.10[i] <- cv.glm(Auto, glm.fit, K = 10)$delta[1]
}
cv.error.10

# Back to a single panel for the CV-error curve
par(mfrow = c(1, 1))
plot(seq_along(cv.error.10), cv.error.10, type = "l",
     xlab = "Polynomial degree", ylab = "10-fold CV error")
# ---- Logistic regression: quality of care ----
qual <- read.csv("quality.csv")
# fix() opens an interactive data editor; skip it when run as a batch script.
if (interactive()) fix(qual)
summary(qual)
str(qual)
pairs(qual)
hist(qual$PoorCare)
table(qual$PoorCare)

if (!requireNamespace("caTools", quietly = TRUE)) install.packages("caTools")
library(caTools)
# Seed the split so it is reproducible when this section is run on its own,
# matching the seeded splits used by the later sections of this script.
set.seed(2018)
# sample.split keeps the PoorCare class ratio in both partitions
split <- sample.split(qual$PoorCare, SplitRatio = 0.8)
split
qTrain <- subset(qual, split == TRUE)
qTest <- subset(qual, split == FALSE)

# Scatter of the two predictors; PoorCare + 1 picks the point colour, which
# presumes PoorCare is coded 0/1.
plot(qTrain$OfficeVisits, qTrain$Narcotics, col = qTrain$PoorCare + 1)

# Two-predictor model, then the model on every column for comparison
logitmodel <- glm(PoorCare ~ OfficeVisits + Narcotics, data = qTrain, family = binomial)
summary(logitmodel)
logitmodel1 <- glm(PoorCare ~ ., data = qTrain, family = binomial)
summary(logitmodel1)

# Training-set predicted probabilities from the two-predictor model
predTrain <- predict(logitmodel, type = "response")
hist(predTrain)
tapply(predTrain, qTrain$PoorCare, mean)

# Confusion matrices (rows: predicted, columns: actual) at several thresholds
t(table(qTrain$PoorCare, predTrain > 0.5))
t(table(qTrain$PoorCare, predTrain > 0.1))
t(table(qTrain$PoorCare, predTrain > 0.07))

# ROC curve and AUC on the training set
if (!requireNamespace("ROCR", quietly = TRUE)) install.packages("ROCR")
library(ROCR)
ROCpredict <- prediction(predTrain, qTrain$PoorCare)
ROCperf <- performance(ROCpredict, "tpr", "fpr")
plot(ROCperf)
# Cutoff labels at 0, 0.25, 0.5, 0.75. The original had c(0,25,0.5,0.75),
# i.e. a comma typed for the decimal point, which asked for a cutoff of 25.
plot(ROCperf, print.cutoffs.at = c(0, 0.25, 0.5, 0.75))
auc.perf <- performance(ROCpredict, measure = "auc")
auc.perf@y.values

plot(qTrain$OfficeVisits, qTrain$Narcotics, col = qTrain$PoorCare + 1)

# Build decision boundary: the p = 0.5 line, where the linear predictor is 0,
#   b0 + b1 * OfficeVisits + b2 * Narcotics = 0
#   => Narcotics = -(b0 + b1 * OfficeVisits) / b2
coefs <- coef(logitmodel)
x <- c(0, 40)
y <- -(coefs[["(Intercept)"]] + coefs[["OfficeVisits"]] * x) / coefs[["Narcotics"]]
lines(x, y, col = "black", lwd = 2)

# Held-out performance at the 0.5 threshold
predTst <- predict(logitmodel, type = "response", newdata = qTest)
t(table(qTest$PoorCare, predTst > 0.5))
# ---- Framingham: ten-year CHD risk ----
Fdata <- read.csv("framingham.csv")
str(Fdata)

library(caTools)
library(ROCR)  # prediction() / performance() used below

# Reproducible 80/20 split that keeps the TenYearCHD class ratio
set.seed(2018)
split <- sample.split(Fdata$TenYearCHD, SplitRatio = 0.8)
Ftrain <- subset(Fdata, split == TRUE)
Ftest <- subset(Fdata, split == FALSE)
table(Ftrain$TenYearCHD)
table(Ftest$TenYearCHD)

# family = binomial makes this a logistic regression. Without it glm() falls
# back to the gaussian family and fits an ordinary linear model, whose
# "response" predictions are not probabilities.
frLogitMod <- glm(TenYearCHD ~ ., data = Ftrain, family = binomial)
summary(frLogitMod)

# Test-set confusion matrix (rows: predicted, columns: actual) at 0.5
predTst <- predict(frLogitMod, type = "response", newdata = Ftest)
t(table(Ftest$TenYearCHD, predTst > 0.5))

# ROC curve and AUC on the test set
ROCpredict <- prediction(predTst, Ftest$TenYearCHD)
ROCperf <- performance(ROCpredict, "tpr", "fpr")
plot(ROCperf)
auc.perf <- performance(ROCpredict, measure = "auc")
auc.perf@y.values
# ---- Loans: predicting not.fully.paid ----
Dloans <- read.csv("loans.csv")
# fix() opens an interactive data editor; skip it when run as a batch script.
if (interactive()) fix(Dloans)
table(Dloans$credit.policy)
str(Dloans)

# Identify NA variables: count the rows with a missing value in any of the
# six listed columns
summary(Dloans)
missing <- subset(
  Dloans,
  is.na(log.annual.inc) | is.na(days.with.cr.line) | is.na(revol.util) |
    is.na(inq.last.6mths) | is.na(delinq.2yrs) | is.na(pub.rec)
)
nrow(missing)

# Reproducible 80/20 split that keeps the not.fully.paid class ratio
library(caTools)
set.seed(2018)
split <- sample.split(Dloans$not.fully.paid, SplitRatio = 0.8)
Dtrain <- subset(Dloans, split == TRUE)
Dtest <- subset(Dloans, split == FALSE)

# Imputation
# Install mice only when it is missing instead of reinstalling on every run.
if (!requireNamespace("mice", quietly = TRUE)) install.packages("mice")
library(mice)

# Train imputation: the outcome column is left out of the imputation input
vars.for.imputation <- setdiff(names(Dtrain), "not.fully.paid")
imputed <- complete(mice(Dtrain[vars.for.imputation]))
Dtrain[vars.for.imputation] <- imputed
summary(Dtrain)

# Test imputation: run separately, using only the test rows
vars.for.imputation <- setdiff(names(Dtest), "not.fully.paid")
imputed <- complete(mice(Dtest[vars.for.imputation]))
Dtest[vars.for.imputation] <- imputed
summary(Dtest)

# Logistic regression on all predictors
mod1 <- glm(not.fully.paid ~ ., data = Dtrain, family = "binomial")
summary(mod1)

# Prediction: test-set probabilities and the confusion matrix
# (rows: predicted, columns: actual) at the 0.5 threshold
Dtest$predicted.risk <- predict(mod1, newdata = Dtest, type = "response")
t(table(Dtest$not.fully.paid, Dtest$predicted.risk > 0.5))

# ROC
library(ROCR)
pred <- prediction(Dtest$predicted.risk, Dtest$not.fully.paid)
as.numeric(performance(pred, "auc")@y.values)
ROCperf <- performance(pred, "tpr", "fpr")
plot(ROCperf)