SlideShare une entreprise Scribd logo
1  sur  9
Télécharger pour lire hors ligne
# part 1: PISA 2009 reading scores -- linear regression ----
# Reads the train/test CSV files, keeps complete rows only, fits a linear
# model of readingScore on every other column and scores it on the test set.
pisaTrain <- read.csv("pisa2009train.csv")
pisaTest <- read.csv("pisa2009test.csv")
str(pisaTrain)
summary(pisaTrain)
pairs(pisaTrain[, 18:24])

# average readingScore per gender
tapply(pisaTrain$readingScore, pisaTrain$male, mean)

# missing values: inspect, drop incomplete rows, inspect again
summary(pisaTrain)
pisaTrain <- na.omit(pisaTrain)
pisaTest <- na.omit(pisaTest)
summary(pisaTrain)
summary(pisaTest)

# read.csv() no longer converts strings to factors (R >= 4.0), and both the
# bar plot and relevel() below need a factor, so convert explicitly first.
pisaTrain$raceeth <- factor(pisaTrain$raceeth)
pisaTest$raceeth <- factor(pisaTest$raceeth)
plot(pisaTest$raceeth)
str(pisaTest$raceeth)

# make "White" the reference level of raceeth in both sets
pisaTrain$raceeth <- relevel(pisaTrain$raceeth, ref = "White")
pisaTest$raceeth <- relevel(pisaTest$raceeth, ref = "White")

lmScore <- lm(readingScore ~ ., data = pisaTrain)
summary(lmScore)

# training-set error
SSE <- sum(lmScore$residuals^2)
RMSE <- sqrt(mean(lmScore$residuals^2))

# test-set error (SSE is deliberately overwritten with the test-set value)
predTest <- predict(lmScore, newdata = pisaTest)
summary(predTest)
rmse <- sqrt(mean((predTest - pisaTest$readingScore)^2))
SSE <- sum((predTest - pisaTest$readingScore)^2)

# test-set R^2 against the baseline that predicts the training mean
baseline <- mean(pisaTrain$readingScore)
SST <- sum((baseline - pisaTest$readingScore)^2)
r2 <- 1 - SSE / SST

plot(lmScore)
hist(pisaTrain$readingScore)
# part 2: Elantra monthly sales -- linear regression ----
# Explores the data, checks collinearity and model assumptions on the full
# data, then fits models on years <= 2012 and evaluates on years > 2012.
Elantra <- read.csv("elantra.csv")
# fix() opens a modal editor and assigns the edited copy back; use a
# read-only viewer instead, and only when a user is present.
if (interactive()) {
  View(Elantra)
}
pairs(Elantra)
cor(Elantra)

library(car)
fit <- lm(ElantraSales ~ ., data = Elantra)
vif(fit)

# Global test of model assumptions
# install only when the package is missing, not on every run
if (!requireNamespace("gvlma", quietly = TRUE)) {
  install.packages("gvlma")
}
library(gvlma)
gvmodel <- gvlma(fit)
summary(gvmodel)

# split by year
ElantraTrain <- subset(Elantra, Year <= 2012)
ElantraTest <- subset(Elantra, Year > 2012)

# model 1: economic indicators and search queries only
ElantraLM <- lm(
  ElantraSales ~ Unemployment + Queries + CPI_energy + CPI_all,
  data = ElantraTrain
)
summary(ElantraLM)

# model 2: add Month as a numeric predictor
ElantraLM <- lm(
  ElantraSales ~ Unemployment + Queries + CPI_energy + CPI_all + Month,
  data = ElantraTrain
)
summary(ElantraLM)

# model 3: add Month as a factor (one coefficient per level)
ElantraTrain$MonthFactor <- as.factor(ElantraTrain$Month)
ElantraTest$MonthFactor <- as.factor(ElantraTest$Month)
ElantraLM <- lm(
  ElantraSales ~ Unemployment + Queries + CPI_energy + CPI_all + MonthFactor,
  data = ElantraTrain
)
summary(ElantraLM)

cor(ElantraTrain[c("Unemployment", "Month", "Queries", "CPI_energy", "CPI_all")])

# test-set evaluation of the last fitted model
PredictTest <- predict(ElantraLM, newdata = ElantraTest)
SSE <- sum((PredictTest - ElantraTest$ElantraSales)^2)
SST <- sum((mean(ElantraTrain$ElantraSales) - ElantraTest$ElantraSales)^2)
# test-set R^2 (the original computed SSE and SST but never combined them)
1 - SSE / SST
# part 3: interaction terms on the Boston housing data ----
# Boston ships with the MASS package; load the data set explicitly because
# nothing earlier in this script brings it into scope.
data("Boston", package = "MASS")

# lstat * age expands to lstat + age + lstat:age
mod1 <- lm(medv ~ lstat * age, data = Boston)
summary(mod1)

# spelling out the main effects next to lstat * age yields the same model
mod2 <- lm(medv ~ lstat + age + lstat * age, data = Boston)
summary(mod2)

# lstat:age alone keeps only the interaction term (no main effects)
mod3 <- lm(medv ~ lstat:age, data = Boston)
summary(mod3)
# part4 Nonlinear Relationship ----
# Fits mpg on horsepower with increasing polynomial degree on the Auto data
# and compares the degrees by 10-fold cross-validation.
# The package is called ISLR (the original tried to install "ILSR");
# install only when it is missing.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
library(ISLR)
?Auto
pairs(Auto)

# pass data = Auto instead of attach(Auto) so the search path stays clean
plot(mpg ~ horsepower, data = Auto)

# linear fit with its four diagnostic plots on one page
summary(lm(mpg ~ horsepower, data = Auto))
par(mfrow = c(2, 2))
plot(lm(mpg ~ horsepower, data = Auto))

# quadratic fit
summary(lm(mpg ~ horsepower + I(horsepower^2), data = Auto))
plot(lm(mpg ~ horsepower + I(horsepower^2), data = Auto))

# degree-7 polynomial fit
summary(lm(mpg ~ poly(horsepower, 7), data = Auto))
plot(lm(mpg ~ poly(horsepower, 7), data = Auto))

# cross validation
if (!requireNamespace("boot", quietly = TRUE)) {
  install.packages("boot")
}
library(boot)
set.seed(2017)
# cv.error.10[i] holds the 10-fold CV error estimate for polynomial degree i
cv.error.10 <- rep(0, 10)
for (i in seq_along(cv.error.10)) {
  glm.fit <- glm(mpg ~ poly(horsepower, i), data = Auto)
  cv.error.10[i] <- cv.glm(Auto, glm.fit, K = 10)$delta[1]
}
cv.error.10

par(mfrow = c(1, 1))
plot(
  seq_along(cv.error.10), cv.error.10,
  type = "l", xlab = "polynomial degree", ylab = "10-fold CV error"
)
# logit: quality-of-care data -- logistic regression ----
# Splits the data 80/20 stratified on PoorCare, fits logistic models,
# inspects thresholds and the ROC curve on the training set, draws the
# decision boundary and finally scores the test set.
qual <- read.csv("quality.csv")
# read-only viewer instead of fix(), which reassigns the edited data
if (interactive()) {
  View(qual)
}
summary(qual)
str(qual)
pairs(qual)
hist(qual$PoorCare)
table(qual$PoorCare)

# install only when the package is missing, not on every run
if (!requireNamespace("caTools", quietly = TRUE)) {
  install.packages("caTools")
}
library(caTools)
# seed the split so the partition is reproducible, as the other sections do
set.seed(2018)
split <- sample.split(qual$PoorCare, SplitRatio = 0.8)
split
qTrain <- subset(qual, split == TRUE)
qTest <- subset(qual, split == FALSE)

# colour encodes the class: PoorCare + 1 picks palette colour 1 or 2
plot(qTrain$OfficeVisits, qTrain$Narcotics, col = qTrain$PoorCare + 1)

logitmodel <- glm(
  PoorCare ~ OfficeVisits + Narcotics,
  data = qTrain, family = binomial
)
summary(logitmodel)
logitmodel1 <- glm(PoorCare ~ ., data = qTrain, family = binomial)
summary(logitmodel1)

# fitted probabilities on the training set (two-predictor model)
predTrain <- predict(logitmodel, type = "response")
hist(predTrain)
tapply(predTrain, qTrain$PoorCare, mean)

# confusion tables at several thresholds (rows: predicted, columns: actual)
t(table(qTrain$PoorCare, predTrain > 0.5))
t(table(qTrain$PoorCare, predTrain > 0.1))
t(table(qTrain$PoorCare, predTrain > 0.07))

if (!requireNamespace("ROCR", quietly = TRUE)) {
  install.packages("ROCR")
}
library(ROCR)
ROCpredict <- prediction(predTrain, qTrain$PoorCare)
ROCperf <- performance(ROCpredict, "tpr", "fpr")
plot(ROCperf)
# the original passed c(0,25,0.5,0.75): a decimal comma turned 0.25 into the
# two cutoffs 0 and 25; label the curve at 0.25, 0.5 and 0.75 instead
plot(ROCperf, print.cutoffs.at = c(0.25, 0.5, 0.75))
auc.perf <- performance(ROCpredict, measure = "auc")
auc.perf@y.values

plot(qTrain$OfficeVisits, qTrain$Narcotics, col = qTrain$PoorCare + 1)
# build decision boundary
# the boundary is where the linear predictor is zero:
#   b0 + b1 * OfficeVisits + b2 * Narcotics = 0
#   => Narcotics = -(b0 + b1 * OfficeVisits) / b2
coefs <- coef(logitmodel)
x <- c(0, 40)
y <- -(coefs[["(Intercept)"]] + coefs[["OfficeVisits"]] * x) /
  coefs[["Narcotics"]]
lines(x, y, col = "black", lwd = 2)

# test-set confusion table at threshold 0.5
predTst <- predict(logitmodel, type = "response", newdata = qTest)
t(table(qTest$PoorCare, predTst > 0.5))
# Framingham: ten-year CHD risk -- logistic regression ----
# Splits the data 80/20 stratified on TenYearCHD, fits a logistic model on
# all predictors and evaluates it on the test set (confusion table, ROC, AUC).
Fdata <- read.csv("framingham.csv")
str(Fdata)

library(caTools)
library(ROCR)
set.seed(2018)
split <- sample.split(Fdata$TenYearCHD, SplitRatio = 0.8)
Ftrain <- subset(Fdata, split == TRUE)
Ftest <- subset(Fdata, split == FALSE)
table(Ftrain$TenYearCHD)
table(Ftest$TenYearCHD)

# family = binomial is required for logistic regression; without it glm()
# defaults to gaussian and fits an ordinary linear model
frLogitMod <- glm(TenYearCHD ~ ., data = Ftrain, family = binomial)
summary(frLogitMod)

# predicted probabilities and confusion table at threshold 0.5
predTst <- predict(frLogitMod, type = "response", newdata = Ftest)
t(table(Ftest$TenYearCHD, predTst > 0.5))

# roc for test
ROCpredict <- prediction(predTst, Ftest$TenYearCHD)
ROCperf <- performance(ROCpredict, "tpr", "fpr")
plot(ROCperf)
auc.perf <- performance(ROCpredict, measure = "auc")
auc.perf@y.values
# loans: predicting not.fully.paid -- imputation + logistic regression ----
# Splits the data 80/20 stratified on not.fully.paid, imputes the predictors
# of each part with mice, fits a logistic model on the training part and
# evaluates it on the test part (confusion table, AUC, ROC curve).
Dloans <- read.csv("loans.csv")
# read-only viewer instead of fix(), which reassigns the edited data
if (interactive()) {
  View(Dloans)
}
table(Dloans$credit.policy)
str(Dloans)

# identify NA variables: count the rows with at least one missing value
summary(Dloans)
missing <- subset(
  Dloans,
  is.na(log.annual.inc) | is.na(days.with.cr.line) | is.na(revol.util) |
    is.na(inq.last.6mths) | is.na(delinq.2yrs) | is.na(pub.rec)
)
nrow(missing)

library(caTools)
set.seed(2018)
split <- sample.split(Dloans$not.fully.paid, SplitRatio = 0.8)
Dtrain <- subset(Dloans, split == TRUE)
Dtest <- subset(Dloans, split == FALSE)

# imputation
# install only when the package is missing, not on every run
if (!requireNamespace("mice", quietly = TRUE)) {
  install.packages("mice")
}
library(mice)

# train imputation: the outcome column is left out of the imputation input
vars.for.imputation <- setdiff(names(Dtrain), "not.fully.paid")
imputed <- complete(mice(Dtrain[vars.for.imputation]))
Dtrain[vars.for.imputation] <- imputed
summary(Dtrain)

# test imputation: done separately, from the test rows only
vars.for.imputation <- setdiff(names(Dtest), "not.fully.paid")
imputed <- complete(mice(Dtest[vars.for.imputation]))
Dtest[vars.for.imputation] <- imputed
summary(Dtest)

# model
mod1 <- glm(not.fully.paid ~ ., data = Dtrain, family = "binomial")
summary(mod1)

# prediction: test-set probabilities and confusion table at threshold 0.5
Dtest$predicted.risk <- predict(mod1, newdata = Dtest, type = "response")
t(table(Dtest$not.fully.paid, Dtest$predicted.risk > 0.5))

# ROC
library(ROCR)
pred <- prediction(Dtest$predicted.risk, Dtest$not.fully.paid)
as.numeric(performance(pred, "auc")@y.values)
ROCperf <- performance(pred, "tpr", "fpr")
plot(ROCperf)

Contenu connexe

Similaire à מערכות לומדות: תרגילי כיתה 4 ו-5

CL metaprogramming
CL metaprogrammingCL metaprogramming
CL metaprogramming
dudarev
 
Introduction to Perl
Introduction to PerlIntroduction to Perl
Introduction to Perl
Sway Wang
 

Similaire à מערכות לומדות: תרגילי כיתה 4 ו-5 (16)

Climbing the Abstract Syntax Tree (PHP Developer Days Dresden 2018)
Climbing the Abstract Syntax Tree (PHP Developer Days Dresden 2018)Climbing the Abstract Syntax Tree (PHP Developer Days Dresden 2018)
Climbing the Abstract Syntax Tree (PHP Developer Days Dresden 2018)
 
Climbing the Abstract Syntax Tree (Forum PHP 2017)
Climbing the Abstract Syntax Tree (Forum PHP 2017)Climbing the Abstract Syntax Tree (Forum PHP 2017)
Climbing the Abstract Syntax Tree (Forum PHP 2017)
 
Rcommands-for those who interested in R.
Rcommands-for those who interested in R.Rcommands-for those who interested in R.
Rcommands-for those who interested in R.
 
Climbing the Abstract Syntax Tree (Southeast PHP 2018)
Climbing the Abstract Syntax Tree (Southeast PHP 2018)Climbing the Abstract Syntax Tree (Southeast PHP 2018)
Climbing the Abstract Syntax Tree (Southeast PHP 2018)
 
Climbing the Abstract Syntax Tree (Bulgaria PHP 2016)
Climbing the Abstract Syntax Tree (Bulgaria PHP 2016)Climbing the Abstract Syntax Tree (Bulgaria PHP 2016)
Climbing the Abstract Syntax Tree (Bulgaria PHP 2016)
 
Climbing the Abstract Syntax Tree (Midwest PHP 2020)
Climbing the Abstract Syntax Tree (Midwest PHP 2020)Climbing the Abstract Syntax Tree (Midwest PHP 2020)
Climbing the Abstract Syntax Tree (Midwest PHP 2020)
 
Climbing the Abstract Syntax Tree (PHP UK 2018)
Climbing the Abstract Syntax Tree (PHP UK 2018)Climbing the Abstract Syntax Tree (PHP UK 2018)
Climbing the Abstract Syntax Tree (PHP UK 2018)
 
Climbing the Abstract Syntax Tree (PHP Russia 2019)
Climbing the Abstract Syntax Tree (PHP Russia 2019)Climbing the Abstract Syntax Tree (PHP Russia 2019)
Climbing the Abstract Syntax Tree (PHP Russia 2019)
 
Ch2
Ch2Ch2
Ch2
 
CL metaprogramming
CL metaprogrammingCL metaprogramming
CL metaprogramming
 
Introduction to Perl
Introduction to PerlIntroduction to Perl
Introduction to Perl
 
Climbing the Abstract Syntax Tree (php[world] 2019)
Climbing the Abstract Syntax Tree (php[world] 2019)Climbing the Abstract Syntax Tree (php[world] 2019)
Climbing the Abstract Syntax Tree (php[world] 2019)
 
Phylogenetics in R
Phylogenetics in RPhylogenetics in R
Phylogenetics in R
 
Perl6 grammars
Perl6 grammarsPerl6 grammars
Perl6 grammars
 
Implementing Software Machines in C and Go
Implementing Software Machines in C and GoImplementing Software Machines in C and Go
Implementing Software Machines in C and Go
 
PHP - Introduction to String Handling
PHP -  Introduction to  String Handling PHP -  Introduction to  String Handling
PHP - Introduction to String Handling
 

Plus de Igor Kleiner

Plus de Igor Kleiner (20)

Анализ данных просто и доступно - урок 1
Анализ данных просто и доступно - урок 1Анализ данных просто и доступно - урок 1
Анализ данных просто и доступно - урок 1
 
מדעי נתונים לכל אחד
מדעי נתונים לכל אחדמדעי נתונים לכל אחד
מדעי נתונים לכל אחד
 
מדע נתונים - למידה מכונות
מדע נתונים - למידה מכונותמדע נתונים - למידה מכונות
מדע נתונים - למידה מכונות
 
מבוא למדעי הנתונים שבוע 2
מבוא למדעי הנתונים שבוע 2מבוא למדעי הנתונים שבוע 2
מבוא למדעי הנתונים שבוע 2
 
מבוא למדעי הנתונים הרצאה 1
מבוא למדעי הנתונים הרצאה 1מבוא למדעי הנתונים הרצאה 1
מבוא למדעי הנתונים הרצאה 1
 
תכנות דינמי הרצאה 3
תכנות דינמי הרצאה 3תכנות דינמי הרצאה 3
תכנות דינמי הרצאה 3
 
תכנות דינמי הרצאה 4
תכנות דינמי הרצאה 4תכנות דינמי הרצאה 4
תכנות דינמי הרצאה 4
 
שאלות לתרגול עצמי
שאלות לתרגול עצמישאלות לתרגול עצמי
שאלות לתרגול עצמי
 
פתרון תרגיל 3
פתרון תרגיל 3פתרון תרגיל 3
פתרון תרגיל 3
 
מבוא לתכנות מדעי: פייתון הרצאה 13
מבוא לתכנות מדעי: פייתון הרצאה 13מבוא לתכנות מדעי: פייתון הרצאה 13
מבוא לתכנות מדעי: פייתון הרצאה 13
 
תכנות מדעי פייתון: הרצאה 12: סיבוכיות
תכנות מדעי פייתון: הרצאה 12: סיבוכיותתכנות מדעי פייתון: הרצאה 12: סיבוכיות
תכנות מדעי פייתון: הרצאה 12: סיבוכיות
 
מבוא לתכנות מדעי: פייתון: הרצאה 11: דבגינג + תכנות דינמי
מבוא לתכנות מדעי: פייתון: הרצאה 11: דבגינג + תכנות דינמימבוא לתכנות מדעי: פייתון: הרצאה 11: דבגינג + תכנות דינמי
מבוא לתכנות מדעי: פייתון: הרצאה 11: דבגינג + תכנות דינמי
 
תכנות מדעי: פייתון: הרצאה 10: : תחום הכרעה
תכנות מדעי: פייתון: הרצאה 10: : תחום הכרעהתכנות מדעי: פייתון: הרצאה 10: : תחום הכרעה
תכנות מדעי: פייתון: הרצאה 10: : תחום הכרעה
 
מבוא לתכנות מדעי: פייתון: הרצאה 9: 2017
מבוא לתכנות מדעי: פייתון: הרצאה 9: 2017מבוא לתכנות מדעי: פייתון: הרצאה 9: 2017
מבוא לתכנות מדעי: פייתון: הרצאה 9: 2017
 
תכנות מדעי: פייתון: הרצאה 8: 2017
תכנות מדעי: פייתון: הרצאה 8:  2017תכנות מדעי: פייתון: הרצאה 8:  2017
תכנות מדעי: פייתון: הרצאה 8: 2017
 
תכנות מדעי: פייתון : הרצאה 7: 2017
תכנות מדעי: פייתון : הרצאה 7: 2017תכנות מדעי: פייתון : הרצאה 7: 2017
תכנות מדעי: פייתון : הרצאה 7: 2017
 
תכנות מדעי: פייתון: הרצאה 6: קבצים, רשימות
תכנות מדעי: פייתון: הרצאה 6: קבצים, רשימותתכנות מדעי: פייתון: הרצאה 6: קבצים, רשימות
תכנות מדעי: פייתון: הרצאה 6: קבצים, רשימות
 
מבוא לתכנות מדעי: פייתון: הרצאה 5: 2017
מבוא לתכנות מדעי: פייתון: הרצאה 5: 2017מבוא לתכנות מדעי: פייתון: הרצאה 5: 2017
מבוא לתכנות מדעי: פייתון: הרצאה 5: 2017
 
מבוא לתכנות מדעי: פייתון: הרצאה 4: 2017
מבוא לתכנות מדעי: פייתון: הרצאה 4: 2017מבוא לתכנות מדעי: פייתון: הרצאה 4: 2017
מבוא לתכנות מדעי: פייתון: הרצאה 4: 2017
 
מבוא לתכנות מדעי: פייתון: הרצאה 3: לולאות
מבוא לתכנות מדעי: פייתון: הרצאה 3: לולאותמבוא לתכנות מדעי: פייתון: הרצאה 3: לולאות
מבוא לתכנות מדעי: פייתון: הרצאה 3: לולאות
 

Dernier

Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in DelhiRussian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
kauryashika82
 
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
fonyou31
 
1029-Danh muc Sach Giao Khoa khoi 6.pdf
1029-Danh muc Sach Giao Khoa khoi  6.pdf1029-Danh muc Sach Giao Khoa khoi  6.pdf
1029-Danh muc Sach Giao Khoa khoi 6.pdf
QucHHunhnh
 
Kisan Call Centre - To harness potential of ICT in Agriculture by answer farm...
Kisan Call Centre - To harness potential of ICT in Agriculture by answer farm...Kisan Call Centre - To harness potential of ICT in Agriculture by answer farm...
Kisan Call Centre - To harness potential of ICT in Agriculture by answer farm...
Krashi Coaching
 

Dernier (20)

Sports & Fitness Value Added Course FY..
Sports & Fitness Value Added Course FY..Sports & Fitness Value Added Course FY..
Sports & Fitness Value Added Course FY..
 
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in DelhiRussian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
Russian Escort Service in Delhi 11k Hotel Foreigner Russian Call Girls in Delhi
 
Accessible design: Minimum effort, maximum impact
Accessible design: Minimum effort, maximum impactAccessible design: Minimum effort, maximum impact
Accessible design: Minimum effort, maximum impact
 
9548086042 for call girls in Indira Nagar with room service
9548086042  for call girls in Indira Nagar  with room service9548086042  for call girls in Indira Nagar  with room service
9548086042 for call girls in Indira Nagar with room service
 
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
Ecosystem Interactions Class Discussion Presentation in Blue Green Lined Styl...
 
Paris 2024 Olympic Geographies - an activity
Paris 2024 Olympic Geographies - an activityParis 2024 Olympic Geographies - an activity
Paris 2024 Olympic Geographies - an activity
 
Mattingly "AI & Prompt Design: The Basics of Prompt Design"
Mattingly "AI & Prompt Design: The Basics of Prompt Design"Mattingly "AI & Prompt Design: The Basics of Prompt Design"
Mattingly "AI & Prompt Design: The Basics of Prompt Design"
 
Interactive Powerpoint_How to Master effective communication
Interactive Powerpoint_How to Master effective communicationInteractive Powerpoint_How to Master effective communication
Interactive Powerpoint_How to Master effective communication
 
Mattingly "AI & Prompt Design: Structured Data, Assistants, & RAG"
Mattingly "AI & Prompt Design: Structured Data, Assistants, & RAG"Mattingly "AI & Prompt Design: Structured Data, Assistants, & RAG"
Mattingly "AI & Prompt Design: Structured Data, Assistants, & RAG"
 
Student login on Anyboli platform.helpin
Student login on Anyboli platform.helpinStudent login on Anyboli platform.helpin
Student login on Anyboli platform.helpin
 
Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...
Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...
Presentation by Andreas Schleicher Tackling the School Absenteeism Crisis 30 ...
 
Código Creativo y Arte de Software | Unidad 1
Código Creativo y Arte de Software | Unidad 1Código Creativo y Arte de Software | Unidad 1
Código Creativo y Arte de Software | Unidad 1
 
Q4-W6-Restating Informational Text Grade 3
Q4-W6-Restating Informational Text Grade 3Q4-W6-Restating Informational Text Grade 3
Q4-W6-Restating Informational Text Grade 3
 
Sanyam Choudhary Chemistry practical.pdf
Sanyam Choudhary Chemistry practical.pdfSanyam Choudhary Chemistry practical.pdf
Sanyam Choudhary Chemistry practical.pdf
 
General AI for Medical Educators April 2024
General AI for Medical Educators April 2024General AI for Medical Educators April 2024
General AI for Medical Educators April 2024
 
social pharmacy d-pharm 1st year by Pragati K. Mahajan
social pharmacy d-pharm 1st year by Pragati K. Mahajansocial pharmacy d-pharm 1st year by Pragati K. Mahajan
social pharmacy d-pharm 1st year by Pragati K. Mahajan
 
1029-Danh muc Sach Giao Khoa khoi 6.pdf
1029-Danh muc Sach Giao Khoa khoi  6.pdf1029-Danh muc Sach Giao Khoa khoi  6.pdf
1029-Danh muc Sach Giao Khoa khoi 6.pdf
 
APM Welcome, APM North West Network Conference, Synergies Across Sectors
APM Welcome, APM North West Network Conference, Synergies Across SectorsAPM Welcome, APM North West Network Conference, Synergies Across Sectors
APM Welcome, APM North West Network Conference, Synergies Across Sectors
 
Call Girls in Dwarka Mor Delhi Contact Us 9654467111
Call Girls in Dwarka Mor Delhi Contact Us 9654467111Call Girls in Dwarka Mor Delhi Contact Us 9654467111
Call Girls in Dwarka Mor Delhi Contact Us 9654467111
 
Kisan Call Centre - To harness potential of ICT in Agriculture by answer farm...
Kisan Call Centre - To harness potential of ICT in Agriculture by answer farm...Kisan Call Centre - To harness potential of ICT in Agriculture by answer farm...
Kisan Call Centre - To harness potential of ICT in Agriculture by answer farm...
 

מערכות לומדות: תרגילי כיתה 4 ו-5

  • 1. #part1 pisaTrain = read.csv("pisa2009train.csv") pisaTest = read.csv("pisa2009test.csv") str(pisaTrain) summary(pisaTrain) pairs(pisaTrain[,18:24]) #average readinScore per gender tapply(pisaTrain$readingScore, pisaTrain$male, mean) #missing variables summary(pisaTrain) pisaTrain = na.omit(pisaTrain) pisaTest = na.omit(pisaTest) summary(pisaTrain) summary(pisaTest) plot(pisaTest$raceeth) str(pisaTest$raceeth) pisaTrain$raceeth = relevel(pisaTrain$raceeth, "White") pisaTest$raceeth = relevel(pisaTest$raceeth, "White") lmScore = lm(readingScore~., data=pisaTrain) summary(lmScore) SSE = sum(lmScore$residuals^2)
  • 2. RMSE =sqrt(mean(lmScore$residuals^2)) predTest = predict(lmScore, newdata=pisaTest) summary(predTest) rmse=sqrt(mean((predTest-pisaTest$readingScore)^2)) SSE=sum((predTest-pisaTest$readingScore)^2) baseline = mean(pisaTrain$readingScore) SST=sum((baseline-pisaTest$readingScore)^2) r2=1-SSE/SST plot(lmScore) hist(pisaTrain$readingScore) #part2 Elantra = read.csv("elantra.csv") fix(Elantra) pairs(Elantra) cor(Elantra) library(car) fit <- lm(ElantraSales~., data=Elantra) vif(fit) # Global test of model assumptions install.packages("gvlma") library(gvlma) gvmodel <- gvlma(fit) summary(gvmodel)
  • 3. ElantraTrain = subset(Elantra, Year <= 2012) ElantraTest = subset(Elantra, Year > 2012) ElantraLM = lm(ElantraSales ~ Unemployment + Queries + CPI_energy + CPI_all, data=ElantraTrain) summary(ElantraLM) ElantraLM = lm(ElantraSales ~ Unemployment + Queries + CPI_energy + CPI_all + Month, data=ElantraTrain) summary(ElantraLM) ElantraTrain$MonthFactor = as.factor(ElantraTrain$Month) ElantraTest$MonthFactor = as.factor(ElantraTest$Month) ElantraLM = lm(ElantraSales ~ Unemployment + Queries + CPI_energy + CPI_all + MonthFactor, data=ElantraTrain) summary(ElantraLM) cor(ElantraTrain[c("Unemployment","Month","Queries","CPI_energy","CPI_all")]) PredictTest = predict(ElantraLM, newdata=ElantraTest) SSE = sum((PredictTest - ElantraTest$ElantraSales)^2) SST = sum((mean(ElantraTrain$ElantraSales) - ElantraTest$ElantraSales)^2) #part 3 mod1=lm(medv~lstat*age,data = Boston) summary(mod1) mod2=lm(medv~lstat+age+lstat*age,data = Boston) summary(mod2)
  • 4. mod3=lm(medv~lstat:age,data = Boston) summary(mod3) #part4 Nonlinear Relationship install.packages("ILSR") library(ISLR) ?Auto pairs(Auto) attach(Auto) plot(horsepower,mpg) summary(lm(mpg~horsepower)) par(mfrow=c(2,2)) plot(lm(mpg~horsepower)) summary(lm(mpg~horsepower+I(horsepower^2))) plot(lm(mpg~horsepower+I(horsepower^2))) summary(lm(mpg~poly(horsepower,7))) plot(lm(mpg~poly(horsepower,7))) #cross validation install.packages("boot") library(boot) set.seed(2017)
  • 5. cv.error.10=rep(0,10) for (i in 1:10){ glm.fit=glm(mpg~poly(horsepower,i),data=Auto) cv.error.10[i]=cv.glm(Auto,glm.fit,K=10)$delta[1] } cv.error.10 par(mfrow=c(1,1)) plot(seq(1:10),cv.error.10,type = "l") #logit qual=read.csv("quality.csv") fix(qual) summary(qual) str(qual) pairs(qual) hist(qual$PoorCare) table(qual$PoorCare) install.packages("caTools") library(caTools) split=sample.split(qual$PoorCare,SplitRatio = 0.8) split qTrain=subset(qual,split==TRUE) qTest=subset(qual,split==FALSE)
  • 6. plot(qTrain$OfficeVisits,qTrain$Narcotics,col=qTrain$PoorCare+1) logitmodel=glm(PoorCare ~OfficeVisits+Narcotics,data=qTrain,family=binomial) summary(logitmodel) logitmodel1=glm(PoorCare ~.,data=qTrain,family=binomial) summary(logitmodel1) predTrain=predict(logitmodel,type = "response") hist(predTrain) tapply(predTrain,qTrain$PoorCare,mean) t(table(qTrain$PoorCare,predTrain>0.5)) t(table(qTrain$PoorCare,predTrain>0.1)) t(table(qTrain$PoorCare,predTrain>0.07)) install.packages("ROCR") library(ROCR) ROCpredict=prediction(predTrain,qTrain$PoorCare) ROCperf=performance(ROCpredict,"tpr","fpr") plot(ROCperf) plot(ROCperf,print.cutoffs.at=c(0,25,0.5,0.75)) auc.perf = performance(ROCpredict, measure = "auc") auc.perf@y.values plot(qTrain$OfficeVisits,qTrain$Narcotics,col=qTrain$PoorCare+1) #build decision boundary
  • 7. coefs = coef(logitmodel) x = c(0,40) y = c((-1/coefs[3]) * (coefs[2] * x + coefs[1])) lines(x, y, col="black", lwd=2) predTst = predict(logitmodel,type="response",newdata = qTest) t(table(qTest$PoorCare,predTst>0.5)) #Framingham Fdata = read.csv("framingham.csv") str(Fdata) library(caTools) set.seed(2018) split=sample.split(Fdata$TenYearCHD,SplitRatio = 0.8) Ftrain=subset(Fdata,split==TRUE) Ftest=subset(Fdata,split==FALSE) table(Ftrain$TenYearCHD) table(Ftest$TenYearCHD) frLogitMod=glm(TenYearCHD~.,data=Ftrain) summary(frLogitMod) predTst = predict(frLogitMod,type="response",newdata = Ftest) t(table(Ftest$TenYearCHD,predTst>0.5 )) #roc for test ROCpredict=prediction(predTst,Ftest$TenYearCHD)
  • 8. ROCperf=performance(ROCpredict,"tpr","fpr") plot(ROCperf) auc.perf = performance(ROCpredict, measure = "auc") auc.perf@y.values #loans Dloans=read.csv("loans.csv") fix(Dloans) table(Dloans$credit.policy) str(Dloans) #identify Na variable summary(Dloans) missing = subset(Dloans, is.na(log.annual.inc) | is.na(days.with.cr.line) | is.na(revol.util) | is.na(inq.last.6mths) | is.na(delinq.2yrs) | is.na(pub.rec)) nrow(missing) library(caTools) set.seed(2018) split=sample.split(Dloans$not.fully.paid,SplitRatio = 0.8) Dtrain=subset(Dloans,split==TRUE) Dtest=subset(Dloans,split==FALSE) #imputation install.packages("mice") library(mice)
  • 9. #train imputation vars.for.imputation = setdiff(names(Dtrain), "not.fully.paid") imputed = complete(mice(Dtrain[vars.for.imputation])) Dtrain[vars.for.imputation] = imputed summary(Dtrain) #test imputation vars.for.imputation = setdiff(names(Dtest), "not.fully.paid") imputed = complete(mice(Dtest[vars.for.imputation])) Dtest[vars.for.imputation] = imputed summary(Dtest) #prediction mod1 = glm(not.fully.paid~., data=Dtrain, family="binomial") summary(mod1) #prediction Dtest$predicted.risk = predict(mod1, newdata=Dtest, type="response") t(table(Dtest$not.fully.paid, Dtest$predicted.risk > 0.5)) #ROC library(ROCR) pred = prediction(Dtest$predicted.risk, Dtest$not.fully.paid) as.numeric(performance(pred, "auc")@y.values) ROCperf=performance(pred,"tpr","fpr") plot(ROCperf)