library(ggplot2) library(cowplot) ## NOTE: The data used in this demo comes from the UCI machine learning ## repository. ## http://archive.ics.uci.edu/ml/index.php ## Specifically, this is the heart disease data set. ## http://archive.ics.uci.edu/ml/datasets/Heart+Disease url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data" data <- read.csv(url, header=FALSE) ##################################### ## ## Reformat the data so that it is ## 1) Easy to use (add nice column names) ## 2) Interpreted correctly by glm().. ## ##################################### head(data) # you see data, but no column names colnames(data) <- c( "age", "sex",# 0 = female, 1 = male "cp", # chest pain # 1 = typical angina, # 2 = atypical angina, # 3 = non-anginal pain, # 4 = asymptomatic "trestbps", # resting blood pressure (in mm Hg) "chol", # serum cholestoral in mg/dl "fbs", # fasting blood sugar if less than 120 mg/dl, 1 = TRUE, 0 = FALSE "restecg", # resting electrocardiographic results # 1 = normal # 2 = having ST-T wave abnormality # 3 = showing probable or definite left ventricular hypertrophy "thalach", # maximum heart rate achieved "exang", # exercise induced angina, 1 = yes, 0 = no "oldpeak", # ST depression induced by exercise relative to rest "slope", # the slope of the peak exercise ST segment # 1 = upsloping # 2 = flat # 3 = downsloping "ca", # number of major vessels (0-3) colored by fluoroscopy "thal", # this is short of thalium heart scan # 3 = normal (no cold spots) # 6 = fixed defect (cold spots during rest and exercise) # 7 = reversible defect (when cold spots only appear during exercise) "hd" # (the predicted attribute) - diagnosis of heart disease # 0 if less than or equal to 50% diameter narrowing # 1 if greater than 50% diameter narrowing ) head(data) # now we have data and column names str(data) # this shows that we need to tell R which columns contain factors # it also shows us that there are some missing values. There are "?"s # in the dataset. These are in the "ca" and "thal" columns... ## First, convert "?"s to NAs... data[data == "?"] <- NA ## Now add factors for variables that are factors and clean up the factors ## that had missing data... data[data$sex == 0,]$sex <- "F" data[data$sex == 1,]$sex <- "M" data$sex <- as.factor(data$sex) data$cp <- as.factor(data$cp) data$fbs <- as.factor(data$fbs) data$restecg <- as.factor(data$restecg) data$exang <- as.factor(data$exang) data$slope <- as.factor(data$slope) data$ca <- as.integer(data$ca) # since this column had "?"s in it # R thinks that the levels for the factor are strings, but # we know they are integers, so first convert the strings to integiers... data$ca <- as.factor(data$ca) # ...then convert the integers to factor levels data$thal <- as.integer(data$thal) # "thal" also had "?"s in it. data$thal <- as.factor(data$thal) ## This next line replaces 0 and 1 with "Healthy" and "Unhealthy" data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy") data$hd <- as.factor(data$hd) # Now convert to a factor str(data) ## this shows that the correct columns are factors ## Now determine how many rows have "NA" (aka "Missing data"). If it's just ## a few, we can remove them from the dataset, otherwise we should consider ## imputing the values with a Random Forest or some other imputation method. nrow(data[is.na(data$ca) | is.na(data$thal),]) data[is.na(data$ca) | is.na(data$thal),] ## so 6 of the 303 rows of data have missing values. This isn't a large ## percentage (2%), so we can just remove them from the dataset ## NOTE: This is different from when we did machine learning with ## Random Forests. When we did that, we imputed values. nrow(data) data <- data[!(is.na(data$ca) | is.na(data$thal)),] nrow(data) ##################################### ## ## Now we can do some quality control by making sure all of the factor ## levels are represented by people with and without heart disease (hd) ## ## NOTE: We also want to exclude variables that only have 1 or 2 samples in ## a category since +/- one or two samples can have a large effect on the ## odds/log(odds) ## ## ##################################### xtabs(~ hd + sex, data=data) xtabs(~ hd + cp, data=data) xtabs(~ hd + fbs, data=data) xtabs(~ hd + restecg, data=data) xtabs(~ hd + exang, data=data) xtabs(~ hd + slope, data=data) xtabs(~ hd + ca, data=data) xtabs(~ hd + thal, data=data) ##################################### ## ## Now we are ready for some logistic regression. First we'll create a very ## simple model that uses sex to predict heart disease ## ##################################### ## let's start super simple and see if sex (female/male) is a good ## predictor... ## First, let's just look at the raw data... xtabs(~ hd + sex, data=data) # sex # hd F M # Healthy 71 89 # Unhealthy 25 112 ## Most of the females are healthy and most of the males are unhealthy. ## Being female is likely to decrease the odds in being unhealthy. ## In other words, if a sample is female, the odds are against it that it ## will be unhealthy ## Being male is likely to increase the odds in being unhealthy... ## In other words, if a sample is male, the odds are for it being unhealthy ########### ## ## Now do the actual logistic regression ## ########### logistic <- glm(hd ~ sex, data=data, family="binomial") summary(logistic) ## (Intercept) -1.0438 0.2326 -4.488 7.18e-06 *** ## sexM 1.2737 0.2725 4.674 2.95e-06 *** ## Let's start by going through the first coefficient... ## (Intercept) -1.0438 0.2326 -4.488 7.18e-06 *** ## ## The intercept is the log(odds) a female will be unhealthy. This is because ## female is the first factor in "sex" (the factors are ordered, ## alphabetically by default,"female", "male") female.log.odds <- log(25 / 71) female.log.odds ## Now let's look at the second coefficient... ## sexM 1.2737 0.2725 4.674 2.95e-06 *** ## ## sexM is the log(odds ratio) that tells us that if a sample has sex=M, the ## odds of being unhealthy are, on a log scale, 1.27 times greater than if ## a sample has sex=F. male.log.odds.ratio <- log((112 / 89) / (25/71)) male.log.odds.ratio ## Now calculate the overall "Pseudo R-squared" and its p-value ## NOTE: Since we are doing logistic regression... ## Null devaince = 2*(0 - LogLikelihood(null model)) ## = -2*LogLikihood(null model) ## Residual deviacne = 2*(0 - LogLikelihood(proposed model)) ## = -2*LogLikelihood(proposed model) ll.null <- logistic$null.deviance/-2 ll.proposed <- logistic$deviance/-2 ## McFadden's Pseudo R^2 = [ LL(Null) - LL(Proposed) ] / LL(Null) (ll.null - ll.proposed) / ll.null ## chi-square value = 2*(LL(Proposed) - LL(Null)) ## p-value = 1 - pchisq(chi-square value, df = 2-1) 1 - pchisq(2*(ll.proposed - ll.null), df=1) 1 - pchisq((logistic$null.deviance - logistic$deviance), df=1) ## Lastly, let's see what this logistic regression predicts, given ## that a patient is either female or male (and no other data about them). predicted.data <- data.frame( probability.of.hd=logistic$fitted.values, sex=data$sex) ## We can plot the data... ggplot(data=predicted.data, aes(x=sex, y=probability.of.hd)) + geom_point(aes(color=sex), size=5) + xlab("Sex") + ylab("Predicted probability of getting heart disease") ## Since there are only two probabilities (one for females and one for males), ## we can use a table to summarize the predicted probabilities. xtabs(~ probability.of.hd + sex, data=predicted.data) ##################################### ## ## Now we will use all of the data available to predict heart disease ## ##################################### logistic <- glm(hd ~ ., data=data, family="binomial") summary(logistic) ## Now calculate the overall "Pseudo R-squared" and its p-value ll.null <- logistic$null.deviance/-2 ll.proposed <- logistic$deviance/-2 ## McFadden's Pseudo R^2 = [ LL(Null) - LL(Proposed) ] / LL(Null) (ll.null - ll.proposed) / ll.null ## The p-value for the R^2 1 - pchisq(2*(ll.proposed - ll.null), df=(length(logistic$coefficients)-1)) ## now we can plot the data predicted.data <- data.frame( probability.of.hd=logistic$fitted.values, hd=data$hd) predicted.data <- predicted.data[ order(predicted.data$probability.of.hd, decreasing=FALSE),] predicted.data$rank <- 1:nrow(predicted.data) ## Lastly, we can plot the predicted probabilities for each sample having ## heart disease and color by whether or not they actually had heart disease ggplot(data=predicted.data, aes(x=rank, y=probability.of.hd)) + geom_point(aes(color=hd), alpha=1, shape=4, stroke=2) + xlab("Index") + ylab("Predicted probability of getting heart disease") ggsave("heart_disease_probabilities.pdf")

Advertisements

line 142 try:

logistic <- glm(hd ~ sex, data=data, family="binomial")

summary(logistic)

LikeLike

Thanks for catching that. There was something odd with including the first line of the output from the summary() command. Once I deleted that line (which was commented out), the original code came back. Strange! But I’m very grateful you spotted the error before it became a problem for other people. Thanks!

LikeLike

Hi Josh, just let you know there is a problem with this video. Many thanks for your great website.

Cheers!

LikeLike

What problem are you having? I just played it back and it seemed fine.

LikeLike

It says that the video is unavailable!

LikeLike

Try reloading the page and see what happens. It should work – there might be something funky going on with your cache. If it doesn’t work, try this link: https://youtu.be/C4N3_XJJ-jU

LikeLike

I am not able to get the data. So to begin learning is impossible.

LikeLike

It appears that the website that hosts the data is down right now. Try again tomorrow. I’m sure they will get the site running again soon.

LikeLike

Hi Josh,

Excellent videos. Your explanation has helped me grasp how to perform logistic regression in R. I was wondering whether you could demonstrate how to put the data in a bar graph with 95% confidence intervals, like is done in academic papers.

LikeLike

That’s a great idea! I’ll put it on the to-do list.

LikeLike

This is a great idea! I’ll put it on the to-do list.

LikeLike

the ggplot is not able to plot the output.

Error in FUN(X[[i]], …) : object ‘hd’ not found

LikeLike

I can’t recreate that error. Are you sure you created “predicted.data” correctly?

predicted.data <- data.frame(

probability.of.hd=logistic$fitted.values,

hd=data$hd)

LikeLike

Josh, when I run the following script, there is an error:

> predicted.data <- data.frame(

+ probability.of.hd=logistic$fitted.values,

+ hd=data$hd)

Error in data.frame(probability.of.hd = logistic$fitted.values, hd = data$hd) :

arguments imply differing number of rows: 297, 303

How can we resolve it?

Thanks!

LikeLike

Awesome. Video and R script has really helped me a lot.

LikeLike

I can’t recreate that error. Can you try running the code from the start and see if it happens again. Your error suggests that you might have skipped the line when we removed samples with “NA” in them:

data <- data[!(is.na(data$ca) | is.na(data$thal)),]

LikeLike

Thanks for your prompt reply, Josh! I’ll let you know when I try it tonight.

LikeLike

Hi Josh,

The ggplot works well!

A) May I know if I’ve had to remove all NA in the data set? Or only in some if the variables?

B) Do we have to do a randomization check before we remove the NA? We have more than 10,000 samples with NA.

C) for the formula:

predicted.data <- data.frame(

probability.of.hd=logistic$fitted.values,

Can I check with you if I were to apply to my model, are the following correct?

hd=data$hd)

A) Logistics: my model name

B) data: mydata (my data set)

C) hd: my response Y

D) fitted.values: keep it as fitness.values

D) Lastly I’m trying to do a confusion matrix of the variables, eg Sex & hd. Beside using xtabs, how to we check the Log odds ratio & p-value?

Pls share with us the r-script to find the better variables.

Thanks for your kind assistance Josh! 👍🏼😊

LikeLike

Josh,

Appreciate if you can enlighten me on my queries. Thanks!

LikeLike

I’d love to help, but I’m too busy with my work and my next StatQuest video. I hope you understand.

LikeLike