Advertisements

# StatQuest: Logistic Regression in R

## NOTE: The data used in this demo comes from the UCI machine learning ## repository. ## http://archive.ics.uci.edu/ml/index.php ## Specifically, this is the heart disease data set. ## http://archive.ics.uci.edu/ml/datasets/Heart+Disease url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data" data <- read.csv(url, header=FALSE) ##################################### ## ## Reformat the data so that it is ## 1) Easy to use (add nice column names) ## 2) Interpreted correctly by glm().. ## ##################################### head(data) # you see data, but no column names colnames(data) <- c( "age", "sex",# 0 = female, 1 = male "cp", # chest pain # 1 = typical angina, # 2 = atypical angina, # 3 = non-anginal pain, # 4 = asymptomatic "trestbps", # resting blood pressure (in mm Hg) "chol", # serum cholestoral in mg/dl "fbs", # fasting blood sugar if less than 120 mg/dl, 1 = TRUE, 0 = FALSE "restecg", # resting electrocardiographic results # 1 = normal # 2 = having ST-T wave abnormality # 3 = showing probable or definite left ventricular hypertrophy "thalach", # maximum heart rate achieved "exang", # exercise induced angina, 1 = yes, 0 = no "oldpeak", # ST depression induced by exercise relative to rest "slope", # the slope of the peak exercise ST segment # 1 = upsloping # 2 = flat # 3 = downsloping "ca", # number of major vessels (0-3) colored by fluoroscopy "thal", # this is short of thalium heart scan # 3 = normal (no cold spots) # 6 = fixed defect (cold spots during rest and exercise) # 7 = reversible defect (when cold spots only appear during exercise) "hd" # (the predicted attribute) - diagnosis of heart disease # 0 if less than or equal to 50% diameter narrowing # 1 if greater than 50% diameter narrowing ) head(data) # now we have data and column names str(data) # this shows that we need to tell R which columns contain factors # it also shows us that there are some missing values. There are "?"s # in the dataset. These are in the "ca" and "thal" columns... ## First, convert "?"s to NAs... data[data == "?"] <- NA ## Now add factors for variables that are factors and clean up the factors ## that had missing data... data[data$sex == 0,]$sex <- "F" data[data$sex == 1,]$sex <- "M" data$sex <- as.factor(data$sex) data$cp <- as.factor(data$cp) data$fbs <- as.factor(data$fbs) data$restecg <- as.factor(data$restecg) data$exang <- as.factor(data$exang) data$slope <- as.factor(data$slope) data$ca <- as.integer(data$ca) # since this column had "?"s in it # R thinks that the levels for the factor are strings, but # we know they are integers, so first convert the strings to integiers... data$ca <- as.factor(data$ca) # ...then convert the integers to factor levels data$thal <- as.integer(data$thal) # "thal" also had "?"s in it. data$thal <- as.factor(data$thal) ## This next line replaces 0 and 1 with "Healthy" and "Unhealthy" data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy") data$hd <- as.factor(data$hd) # Now convert to a factor str(data) ## this shows that the correct columns are factors ## Now determine how many rows have "NA" (aka "Missing data"). If it's just ## a few, we can remove them from the dataset, otherwise we should consider ## imputing the values with a Random Forest or some other imputation method. nrow(data[is.na(data$ca) | is.na(data$thal),]) data[is.na(data$ca) | is.na(data$thal),] ## so 6 of the 303 rows of data have missing values. This isn't a large ## percentage (2%), so we can just remove them from the dataset ## NOTE: This is different from when we did machine learning with ## Random Forests. When we did that, we imputed values. nrow(data) data <- data[!(is.na(data$ca) | is.na(data$thal)),] nrow(data) ##################################### ## ## Now we can do some quality control by making sure all of the factor ## levels are represented by people with and without heart disease (hd) ## ## NOTE: We also want to exclude variables that only have 1 or 2 samples in ## a category since +/- one or two samples can have a large effect on the ## odds/log(odds) ## ## ##################################### xtabs(~ hd + sex, data=data) xtabs(~ hd + cp, data=data) xtabs(~ hd + fbs, data=data) xtabs(~ hd + restecg, data=data) xtabs(~ hd + exang, data=data) xtabs(~ hd + slope, data=data) xtabs(~ hd + ca, data=data) xtabs(~ hd + thal, data=data) ##################################### ## ## Now we are ready for some logistic regression. First we'll create a very ## simple model that uses sex to predict heart disease ## ##################################### ## let's start super simple and see if sex (female/male) is a good ## predictor... ## First, let's just look at the raw data... xtabs(~ hd + sex, data=data) # sex # hd F M # Healthy 71 89 # Unhealthy 25 112 ## Most of the females are healthy and most of the males are unhealthy. ## Being female is likely to decrease the odds in being unhealthy. ## In other words, if a sample is female, the odds are against it that it ## will be unhealthy ## Being male is likely to increase the odds in being unhealthy... ## In other words, if a sample is male, the odds are for it being unhealthy ########### ## ## Now do the actual logistic regression ## ########### logistic <- glm(hd ~ sex, data=data, family="binomial") summary(logistic) ## (Intercept) -1.0438 0.2326 -4.488 7.18e-06 *** ## sexM 1.2737 0.2725 4.674 2.95e-06 *** ## Let's start by going through the first coefficient... ## (Intercept) -1.0438 0.2326 -4.488 7.18e-06 *** ## ## The intercept is the log(odds) a female will be unhealthy. This is because ## female is the first factor in "sex" (the factors are ordered, ## alphabetically by default,"female", "male") female.log.odds <- log(25 / 71) female.log.odds ## Now let's look at the second coefficient... ## sexM 1.2737 0.2725 4.674 2.95e-06 *** ## ## sexM is the log(odds ratio) that tells us that if a sample has sex=M, the ## odds of being unhealthy are, on a log scale, 1.27 times greater than if ## a sample has sex=F. male.log.odds.ratio <- log((112 / 89) / (25/71)) male.log.odds.ratio ## Now calculate the overall "Pseudo R-squared" and its p-value ## NOTE: Since we are doing logistic regression... ## Null devaince = 2*(0 - LogLikelihood(null model)) ## = -2*LogLikihood(null model) ## Residual deviacne = 2*(0 - LogLikelihood(proposed model)) ## = -2*LogLikelihood(proposed model) ll.null <- logistic$null.deviance/-2 ll.proposed <- logistic$deviance/-2 ## McFadden's Pseudo R^2 = [ LL(Null) - LL(Proposed) ] / LL(Null) (ll.null - ll.proposed) / ll.null ## chi-square value = 2*(LL(Proposed) - LL(Null)) ## p-value = 1 - pchisq(chi-square value, df = 2-1) 1 - pchisq(2*(ll.proposed - ll.null), df=1) 1 - pchisq((logistic$null.deviance - logistic$deviance), df=1) ## Lastly, let's see what this logistic regression predicts, given ## that a patient is either female or male (and no other data about them). predicted.data <- data.frame( probability.of.hd=logistic$fitted.values, sex=data$sex) ## We can plot the data... ggplot(data=predicted.data, aes(x=sex, y=probability.of.hd)) + geom_point(aes(color=sex), size=5) + xlab("Sex") + ylab("Predicted probability of getting heart disease") ## Since there are only two probabilities (one for females and one for males), ## we can use a table to summarize the predicted probabilities. xtabs(~ probability.of.hd + sex, data=predicted.data) ##################################### ## ## Now we will use all of the data available to predict heart disease ## ##################################### logistic <- glm(hd ~ ., data=data, family="binomial") summary(logistic) ## Now calculate the overall "Pseudo R-squared" and its p-value ll.null <- logistic$null.deviance/-2 ll.proposed <- logistic$deviance/-2 ## McFadden's Pseudo R^2 = [ LL(Null) - LL(Proposed) ] / LL(Null) (ll.null - ll.proposed) / ll.null ## The p-value for the R^2 1 - pchisq(2*(ll.proposed - ll.null), df=(length(logistic$coefficients)-1)) ## now we can plot the data predicted.data <- data.frame( probability.of.hd=logistic$fitted.values, hd=data$hd) predicted.data <- predicted.data[ order(predicted.data$probability.of.hd, decreasing=FALSE),] predicted.data$rank <- 1:nrow(predicted.data) ## Lastly, we can plot the predicted probabilities for each sample having ## heart disease and color by whether or not they actually had heart disease library(ggplot2) library(cowplot) ggplot(data=predicted.data, aes(x=rank, y=probability.of.hd)) + geom_point(aes(color=hd), alpha=1, shape=4, stroke=2) + xlab("Index") + ylab("Predicted probability of getting heart disease") ggsave("heart_disease_probabilities.pdf")

# StatQuest: Random Forests in R

library(ggplot2) library(cowplot) library(randomForest) ## NOTE: The data used in this demo comes from the UCI machine learning ## repository. ## http://archive.ics.uci.edu/ml/index.php ## Specifically, this is the heart disease data set. ## http://archive.ics.uci.edu/ml/datasets/Heart+Disease url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data" data <- read.csv(url, header=FALSE) ##################################### ## ## Reformat the data so that it is ## 1) Easy to use (add nice column names) ## 2) Interpreted correctly by randomForest.. ## ##################################### head(data) # you see data, but no column names colnames(data) <- c( "age", "sex",# 0 = female, 1 = male "cp", # chest pain # 1 = typical angina, # 2 = atypical angina, # 3 = non-anginal pain, # 4 = asymptomatic "trestbps", # resting blood pressure (in mm Hg) "chol", # serum cholestoral in mg/dl "fbs", # fasting blood sugar greater than 120 mg/dl, 1 = TRUE, 0 = FALSE "restecg", # resting electrocardiographic results # 1 = normal # 2 = having ST-T wave abnormality # 3 = showing probable or definite left ventricular hypertrophy "thalach", # maximum heart rate achieved "exang", # exercise induced angina, 1 = yes, 0 = no "oldpeak", # ST depression induced by exercise relative to rest "slope", # the slope of the peak exercise ST segment # 1 = upsloping # 2 = flat # 3 = downsloping "ca", # number of major vessels (0-3) colored by fluoroscopy "thal", # this is short of thalium heart scan # 3 = normal (no cold spots) # 6 = fixed defect (cold spots during rest and exercise) # 7 = reversible defect (when cold spots only appear during exercise) "hd" # (the predicted attribute) - diagnosis of heart disease # 0 if less than or equal to 50% diameter narrowing # 1 if greater than 50% diameter narrowing ) head(data) # now we have data and column names str(data) # this shows that we need to tell R which columns contain factors # it also shows us that there are some missing values. There are "?"s # in the dataset. ## First, replace "?"s with NAs. data[data == "?"] <- NA ## Now add factors for variables that are factors and clean up the factors ## that had missing data... data[data$sex == 0,]$sex <- "F" data[data$sex == 1,]$sex <- "M" data$sex <- as.factor(data$sex) data$cp <- as.factor(data$cp) data$fbs <- as.factor(data$fbs) data$restecg <- as.factor(data$restecg) data$exang <- as.factor(data$exang) data$slope <- as.factor(data$slope) data$ca <- as.integer(data$ca) # since this column had "?"s in it (which # we have since converted to NAs) R thinks that # the levels for the factor are strings, but # we know they are integers, so we'll first # convert the strings to integiers... data$ca <- as.factor(data$ca) # ...then convert the integers to factor levels data$thal <- as.integer(data$thal) # "thal" also had "?"s in it. data$thal <- as.factor(data$thal) ## This next line replaces 0 and 1 with "Healthy" and "Unhealthy" data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy") data$hd <- as.factor(data$hd) # Now convert to a factor str(data) ## this shows that the correct columns are factors and we've replaced ## "?"s with NAs because "?" no longer appears in the list of factors ## for "ca" and "thal" ##################################### ## ## Now we are ready to build a random forest. ## ##################################### set.seed(42) ## NOTE: For most machine learning methods, you need to divide the data ## manually into a "training" set and a "test" set. This allows you to train ## the method using the training data, and then test it on data it was not ## originally trained on. ## ## In contrast, Random Forests split the data into "training" and "test" sets ## for you. This is because Random Forests use bootstrapped ## data, and thus, not every sample is used to build every tree. The ## "training" dataset is the bootstrapped data and the "test" dataset is ## the remaining samples. The remaining samples are called ## the "Out-Of-Bag" (OOB) data. ## impute any missing values in the training set using proximities data.imputed <- rfImpute(hd ~ ., data = data, iter=6) ## NOTE: iter = the number of iterations to run. Breiman says 4 to 6 iterations ## is usually good enough. With this dataset, when we set iter=6, OOB-error ## bounces around between 17% and 18%. When we set iter=20, # set.seed(42) # data.imputed <- rfImpute(hd ~ ., data = data, iter=20) ## we get values a little better and a little worse, so doing more ## iterations doesn't improve the situation. ## ## NOTE: If you really want to micromanage how rfImpute(), ## you can change the number of trees it makes (the default is 300) and the ## number of variables that it will consider at each step. ## Now we are ready to build a random forest. ## NOTE: If the thing we're trying to predict (in this case it is ## whether or not someone has heart disease) is a continuous number ## (i.e. "weight" or "height"), then by default, randomForest() will set ## "mtry", the number of variables to consider at each step, ## to the total number of variables divided by 3 (rounded down), or to 1 ## (if the division results in a value less than 1). ## If the thing we're trying to predict is a "factor" (i.e. either "yes/no" ## or "ranked"), then randomForest() will set mtry to ## the square root of the number of variables (rounded down to the next ## integer value). ## In this example, "hd", the thing we are trying to predict, is a factor and ## there are 13 variables. So by default, randomForest() will set ## mtry = sqrt(13) = 3.6 rounded down = 3 ## Also, by default random forest generates 500 trees (NOTE: rfImpute() only ## generates 300 tress by default) model <- randomForest(hd ~ ., data=data.imputed, proximity=TRUE) ## RandomForest returns all kinds of things model # gives us an overview of the call, along with... # 1) The OOB error rate for the forest with ntree trees. # In this case ntree=500 by default # 2) The confusion matrix for the forest with ntree trees. # The confusion matrix is laid out like this: # # Healthy Unhealthy # -------------------------------------------------------------- # Healthy | Number of healthy people | Number of healthy people | # | correctly called "healthy" | incorectly called "unhealthy" | # | by the forest. | by the forest | # -------------------------------------------------------------- # Unhealthy| Number of unhealthy people | Number of unhealthy peole | # | incorrectly called | correctly called "unhealthy" | # | "healthy" by the forest | by the forest | # -------------------------------------------------------------- ## Now check to see if the random forest is actually big enough... ## Up to a point, the more trees in the forest, the better. You can tell when ## you've made enough when the OOB no longer improves. oob.error.data <- data.frame( Trees=rep(1:nrow(model$err.rate), times=3), Type=rep(c("OOB", "Healthy", "Unhealthy"), each=nrow(model$err.rate)), Error=c(model$err.rate[,"OOB"], model$err.rate[,"Healthy"], model$err.rate[,"Unhealthy"])) ggplot(data=oob.error.data, aes(x=Trees, y=Error)) + geom_line(aes(color=Type)) # ggsave("oob_error_rate_500_trees.pdf") ## Blue line = The error rate specifically for calling "Unheathly" patients that ## are OOB. ## ## Green line = The overall OOB error rate. ## ## Red line = The error rate specifically for calling "Healthy" patients ## that are OOB. ## NOTE: After building a random forest with 500 tress, the graph does not make ## it clear that the OOB-error has settled on a value or, if we added more ## trees, it would continue to decrease. ## So we do the whole thing again, but this time add more trees. model <- randomForest(hd ~ ., data=data.imputed, ntree=1000, proximity=TRUE) model oob.error.data <- data.frame( Trees=rep(1:nrow(model$err.rate), times=3), Type=rep(c("OOB", "Healthy", "Unhealthy"), each=nrow(model$err.rate)), Error=c(model$err.rate[,"OOB"], model$err.rate[,"Healthy"], model$err.rate[,"Unhealthy"])) ggplot(data=oob.error.data, aes(x=Trees, y=Error)) + geom_line(aes(color=Type)) # ggsave("oob_error_rate_1000_trees.pdf") ## After building a random forest with 1,000 trees, we get the same OOB-error ## 16.5% and we can see convergence in the graph. So we could have gotten ## away with only 500 trees, but we wouldn't have been sure that number ## was enough. ## If we want to compare this random forest to others with different values for ## mtry (to control how many variables are considered at each step)... oob.values <- vector(length=10) for(i in 1:10) { temp.model <- randomForest(hd ~ ., data=data.imputed, mtry=i, ntree=1000) oob.values[i] <- temp.model$err.rate[nrow(temp.model$err.rate),1] } oob.values ## [1] 0.1716172 0.1716172 0.1617162 0.1848185 0.1749175 0.1947195 0.1815182 ## [8] 0.2013201 0.1881188 0.1947195 ## The lowest value is when mtry=3, so the default setting was the best. ## Now let's create an MDS-plot to show how the samples are related to each ## other. ## ## Start by converting the proximity matrix into a distance matrix. distance.matrix <- dist(1-model$proximity) mds.stuff <- cmdscale(distance.matrix, eig=TRUE, x.ret=TRUE) ## calculate the percentage of variation that each MDS axis accounts for... mds.var.per <- round(mds.stuff$eig/sum(mds.stuff$eig)*100, 1) ## now make a fancy looking plot that shows the MDS axes and the variation: mds.values <- mds.stuff$points mds.data <- data.frame(Sample=rownames(mds.values), X=mds.values[,1], Y=mds.values[,2], Status=data.imputed$hd) ggplot(data=mds.data, aes(x=X, y=Y, label=Sample)) + geom_text(aes(color=Status)) + theme_bw() + xlab(paste("MDS1 - ", mds.var.per[1], "%", sep="")) + ylab(paste("MDS2 - ", mds.var.per[2], "%", sep="")) + ggtitle("MDS plot using (1 - Random Forest Proximities)")