# Load necessary libraries
library(dplyr)
library(caret)
library(randomForest)
# Project 1: Predictive Analytics for Healthcare
# Data Science
# Healthcare
# Predictive Analytics
#
# Overview
# In this project, I developed a predictive analytics model to forecast patient readmissions in hospitals. The goal was to identify high-risk patients and implement early interventions to reduce readmission rates.
#
# Key Features
# - Data Cleaning and Preprocessing: Handling missing values, outliers, and data normalization.
# - Model Development: Using logistic regression and random forest algorithms.
# - Model Evaluation: Assessing model performance with metrics like accuracy, precision, recall, and ROC-AUC.
#
# R Code Snippet
# Simulate a synthetic patient data set ----
# Every column is drawn independently of the others, so the outcome
# (`readmission`) carries no real relationship to the predictors.

# Set seed for reproducibility
set.seed(123)

# Number of samples
n_samples <- 1000

# Simulate data
# Age is drawn uniformly from the integers 18..90.
age <- sample(18:90, n_samples, replace = TRUE)
gender <- sample(c("Male", "Female"), n_samples, replace = TRUE)
# Blood pressure is drawn uniformly from 90..180, cholesterol from 150..300.
blood_pressure <- sample(90:180, n_samples, replace = TRUE)
cholesterol <- sample(150:300, n_samples, replace = TRUE)
# Binary flags: `prob` gives the weights for 0 and 1, in that order.
diabetes <- sample(
  c(0, 1), n_samples,
  replace = TRUE, prob = c(0.85, 0.15)
)
heart_disease <- sample(
  c(0, 1), n_samples,
  replace = TRUE, prob = c(0.7, 0.3)
)
# Outcome: 1 is drawn with probability 0.2, so the classes are imbalanced.
readmission <- sample(
  c(0, 1), n_samples,
  replace = TRUE, prob = c(0.8, 0.2)
)
# Create data frame
# Gender and the three binary indicators are stored as factors with
# explicit level order; Age, Blood_Pressure and Cholesterol stay numeric.
# Readmission must be a factor so that randomForest() fits a classifier
# rather than a regression.
healthcare_data <- data.frame(
  Age = age,
  Gender = factor(gender, levels = c("Male", "Female")),
  Blood_Pressure = blood_pressure,
  Cholesterol = cholesterol,
  Diabetes = factor(diabetes, levels = c(0, 1)),
  Heart_Disease = factor(heart_disease, levels = c(0, 1)),
  Readmission = factor(readmission, levels = c(0, 1))
)

# Display the first few rows of the data
head(healthcare_data)
#>   Age Gender Blood_Pressure Cholesterol Diabetes Heart_Disease Readmission
#> 1  48 Female            163         167        0             0           1
#> 2  68 Female             94         291        0             0           0
#> 3  31 Female            164         266        0             0           0
#> 4  84   Male            101         251        0             1           0
#> 5  59 Female             94         192        1             1           0
#> 6  67   Male            177         198        0             0           0
# Split data into training and testing sets
# Re-seed so the partition is reproducible on its own.
set.seed(123)
# createDataPartition() samples within the levels of Readmission, so the
# 80% training share keeps roughly the same class balance as the full data.
# list = FALSE returns the row indices as a matrix instead of a list.
trainIndex <- createDataPartition(
  healthcare_data$Readmission,
  p = 0.8, list = FALSE
)
trainData <- healthcare_data[trainIndex, ]
# Negative indexing keeps every row that was not selected for training.
testData <- healthcare_data[-trainIndex, ]
# Train a random forest model
# `Readmission ~ .` uses every other column of trainData as a predictor;
# ntree = 100 grows 100 trees.
rf_model <- randomForest(Readmission ~ ., data = trainData, ntree = 100)
# Evaluate the model
# Predict class labels for the held-out rows, then cross-tabulate them
# against the true labels.
predictions <- predict(rf_model, testData)
# NOTE(review): no `positive` argument is given, so the first factor level
# ("0", not readmitted) is reported as the positive class. Pass
# positive = "1" if sensitivity should describe readmitted patients.
confusionMatrix(predictions, testData$Readmission)
#> Confusion Matrix and Statistics
#>
#>           Reference
#> Prediction   0   1
#>          0 162  36
#>          1   1   0
#>
#>                Accuracy : 0.8141
#>                  95% CI : (0.7529, 0.8656)
#>     No Information Rate : 0.8191
#>     P-Value [Acc > NIR] : 0.6157
#>
#>                   Kappa : -0.0099
#>
#>  Mcnemar's Test P-Value : 2.276e-08
#>
#>             Sensitivity : 0.9939
#>             Specificity : 0.0000
#>          Pos Pred Value : 0.8182
#>          Neg Pred Value : 0.0000
#>              Prevalence : 0.8191
#>          Detection Rate : 0.8141
#>    Detection Prevalence : 0.9950
#>       Balanced Accuracy : 0.4969
#>
#>        'Positive' Class : 0