This R markdown will take you through the set of figures that I created during my last internship along with the data manipulation and cleaning that I used to accomplish my analysis.
Goal: Analyze maize leaf characteristics and their ability to discriminate its Genetic Background using Principal Component Analysis and Random Forest Classification.
Note: Most of the plots are interactable so make sure you hover over them.
# Data Cleaning
library(tidyverse)
library(dplyr)
library(stringr)
# Data Viz
library(ggplot2)
library(ggiraph)
#Presentation
library(kableExtra)
library(DT)
library(patchwork)
# Classification
library(caret) # Cross Validation
library(caTools)
library(randomForest)
# Read in dataset
all_shoots = read.csv("all_years_leafs_final.csv")
# Remove missing values
all_shoots = all_shoots[complete.cases(all_shoots), ]
datatable(
head(all_shoots),
options = list(
pageLength = 10, # Number of rows to display
searching = FALSE, # Disable searching
scrollX = TRUE, # Enable horizontal scrolling
autoWidth = TRUE # Automatically adjust column width
),
escape = FALSE
)
all_shoots2 = all_shoots %>%
mutate(Year = as.factor(X)) %>% # Make year column into a factor
select(-X) %>%
# Convert Columns into Measurement Columns
mutate(
Blade_Width = as.numeric(Blade_Width),
Blade_Length = as.numeric(Blade_Length),
Surface_Area = as.numeric(Surface_Area),
Sheath_Length = as.numeric(Sheath_Length),
Genetic_Origin = str_replace(Genetic_Origin, "Stiff Stalk", "Stiff_Stalk")
)
leaf_counts_df = all_shoots2 %>%
group_by(Leaf_No) %>%
summarise(Count = n()) %>%
arrange(desc(Count))
# Display the table using knitr::kable for a clean look
leaf_counts_df %>%
kable(col.names = c("Leaf Number", "Number of Samples"),
caption = "Number of Rows for Each Leaf No") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = F)
Leaf Number | Number of Samples |
---|---|
4 | 604 |
3 | 596 |
2 | 544 |
1 | 146 |
5 | 118 |
The Maize Leaf measurements were taken on leaves 1 through 5. However, as seen with the table above, the number of samples are imbalanced. For the purposes of this analysis, we will only look at leaves 3 and 4 since they have the most samples.
# Filter out leaves 1,2, 5 and widen dataset into leaf_no measurement pairs
all_shoots_wide = all_shoots2 %>%
filter(Leaf_No == 4 | Leaf_No == 3) %>% # subset out leaves 4 and 3
pivot_wider(
names_from = Leaf_No, # Append Leaf_4 and Leaf_5 to the new columns
values_from = c(Blade_Length, Blade_Width, Sheath_Length, Surface_Area),
names_glue = "{.value}_Leaf{Leaf_No}",
# Group by the following columns to account for duplicates
id_cols = c(Year, Season, Genotype_ID, Treatment_ID, Genetic_Origin, Plant_ID),
# for duplicate values, sum the the rows together
values_fn = list(Blade_Length = sum, Blade_Width = sum, Sheath_Length = sum, Surface_Area = sum)
) %>%
select(-Plant_ID) %>% # Get rid of PLant_ID column
drop_na() # Remove rows that only have Leaf 3 data or only have Leaf 4 data
Using leaves three and four, I pivot the data frame wider so that there is a column for each leaf and measurement pair. An example is show below:
Pivot Wider Example
High dimensional data can be difficult to interpret and visualize. Principle Component Analyis allows us to reduce the dimensionality of the data so that we can look for patterns while sill retaining most of the information.
numeric_data = all_shoots_wide[, c(7:ncol(all_shoots_wide))]
scaled_data = scale(numeric_data) #make the entire dataset on the same scale
fit = prcomp(scaled_data, center=T, scale=T) #conduct the PCA
# Plot first two principle components
comp_leafs = data.frame(fit$x[,1:2])
comp_leafs$Type = all_shoots_wide$Genetic_Origin
# Plot first two principle components and color by Genetic Origin
p = ggplot(comp_leafs, aes(x = PC1, y = PC2,
color = Type, data_id = Type,
tooltip = paste0("Class: ", Type))) +
geom_point_interactive(shape = 19, size = 2, alpha = 0.6) +
stat_ellipse(size = 1) + # Customize ellipse
labs(x = "Principal Component 1",
y = "Principal Component 2",
title = "PCA Plot of Leaf 3 & 4 Data",
subtitle = "Colored by Genetic Origin") + # Add subtitle
theme_minimal(base_size = 15) + # Set base font size
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 18), # Center title
plot.subtitle = element_text(hjust = 0.5, size = 14), # Center subtitle
legend.position = "right", # Adjust legend position
legend.title = element_text(face = "bold"),
legend.background = element_rect(fill = "gray95", size = 0.5, linetype = "solid"),
panel.grid.major = element_line(color = "gray80", size = 0.5), # Subtle grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
)
interactive_plot <- girafe(
ggobj = p,
width_svg = 10,
height_svg = 7,
options = list(
opts_hover(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;"), # Highlight hovered points
opts_hover_key(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;") # Highlight entire group on hover
)
)
interactive_plot
From this plot, we can see the clusters of Genetic Origin have a lot of overlap and there aren’t any clear patterns. Lets color by another variable that may be explaining the variation.
# Plot first two principle components
comp_leafs = data.frame(fit$x[,1:2])
comp_leafs$Year = all_shoots_wide$Year
# Plot first two principle components and color by Genetic Origin
p = ggplot(comp_leafs, aes(x = PC1, y = PC2,
color = Year, data_id = Year,
tooltip = paste0("Year: ", Year))) +
geom_point_interactive(shape = 19, size = 2, alpha = 0.6) +
stat_ellipse(size = 1) + # Customize ellipse
labs(x = "Principal Component 1",
y = "Principal Component 2",
title = "PCA Plot of Leaf 3 & 4 Data",
subtitle = "Colored by Year") + # Add subtitle
theme_minimal(base_size = 15) + # Set base font size
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 18), # Center title
plot.subtitle = element_text(hjust = 0.5, size = 14), # Center subtitle
legend.position = "right", # Adjust legend position
legend.title = element_text(face = "bold"),
legend.background = element_rect(fill = "gray95", size = 0.5, linetype = "solid"),
panel.grid.major = element_line(color = "gray80", size = 0.5), # Subtle grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
)
interactive_plot <- girafe(
ggobj = p,
width_svg = 10,
height_svg = 7,
options = list(
opts_hover(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;"), # Highlight hovered points
opts_hover_key(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;") # Highlight entire group on hover
)
)
interactive_plot
When coloring by Year, the clusters are much more distinct. The separation of the data seems to be driven by the environmental conditions that differ across years. Lets normalize the data by the Control to account for this and then re-run the analysis
# Normalize the shoot data by subtracting by the mean of Control Genotype B73 in each Year, Season, Treatment Group
normalized_shoots = all_shoots_wide %>%
group_by(Year, Treatment_ID, Season) %>%
# Mutate the columns in the widened dataset
# First calculate the mean of each Year, Season, B73 group
# Then, subtract each row in a group by the mean for that group
mutate(across(where(is.numeric),
~ (. - mean(. [Genotype_ID == "B73"], na.rm = TRUE)))) %>%
ungroup() %>%
drop_na()
# Re-run the analysis with normalized data
numeric_data_normalized = normalized_shoots[, c(7:ncol(normalized_shoots))]
scaled_data = scale(numeric_data_normalized) #make the entire dataset on the same scale
fit = prcomp(scaled_data, center=T, scale=T) #conduct the PCA
# Plot first two principle components
comp_leafs = data.frame(fit$x[,1:2])
comp_leafs$Type = all_shoots_wide$Genetic_Origin
# Plot first two principle components and color by Genetic Origin
p = ggplot(comp_leafs, aes(x = PC1, y = PC2,
color = Type, data_id = Type,
tooltip = paste0("Class: ", Type))) +
geom_point_interactive(shape = 19, size = 2, alpha = 0.6) +
stat_ellipse(size = 1) + # Customize ellipse
labs(x = "Principal Component 1",
y = "Principal Component 2",
title = "PCA Plot of Leaf 3 & 4 Data (Normalized)",
subtitle = "Colored by Genetic Origin") + # Add subtitle
theme_minimal(base_size = 15) + # Set base font size
theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 18), # Center title
plot.subtitle = element_text(hjust = 0.5, size = 14), # Center subtitle
legend.position = "right", # Adjust legend position
legend.title = element_text(face = "bold"),
legend.background = element_rect(fill = "gray95", size = 0.5, linetype = "solid"),
panel.grid.major = element_line(color = "gray80", size = 0.5), # Subtle grid lines
panel.grid.minor = element_blank(), # Remove minor grid lines
)
interactive_plot <- girafe(
ggobj = p,
width_svg = 10,
height_svg = 7,
options = list(
opts_hover(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;"), # Highlight hovered points
opts_hover_key(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;") # Highlight entire group on hover
)
)
interactive_plot
After normalizing the data to reduce the effect of distribution shift seen across years, we re-ran the analysis and see much more distinct clusters. In this plot, we can see that Stiff Stalk has the least variation and Popcorn is also very distinct.
Now that we have seen how the high dimensional leaf data is clustered, lets see if our visual observations match up with machine learning model results.
A random Forest model with 500 trees and 30 k-Fold cross validation will be used. An 80% Train and 20% Test split will be used to evaluate the ability of the model to generalize to new data.
# Create dataframe with numeric measurements and mutate on desired class column
classification_shoots = numeric_data_normalized %>%
mutate(Class = as.factor(make.names(normalized_shoots$Genetic_Origin)))
data = classification_shoots
# Splitting data into train and test set
set.seed(120) # Setting seed
split = sample.split(data$Class, SplitRatio = 0.80)
train = subset(data, split == TRUE)
test = subset(data, split == FALSE)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, : There were missing
## values in resampled performance measures.
# Calculate overall accuracy
correct_predictions = sum(diag(confusion_mtx))
total_instances = sum(confusion_mtx)
accuracy = correct_predictions / total_instances
# Print accuracy
print(accuracy)
## [1] 0.8220339
# Confusion Matrix
confusion_mtx = confusionMatrix(as.factor(y_pred), as.factor(test$Class))
# # Extract the confusion matrix table
confusion_table = as.data.frame(confusion_mtx$table)
confusion_table$Prediction = factor(confusion_table$Prediction, levels=rev(levels(confusion_table$Prediction)))
p_matrix = ggplot(confusion_table, aes(Prediction, Reference, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq)) +
scale_fill_gradient(low = "white", high = "#009194") +
labs(x = "Reference", y = "Prediction", title = "Confusion Matrix of Predictions on Held Out Test Set") +
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 18), # Center title
axis.title.x = element_text(face = "bold", size = 16), # Bold axis titles
axis.title.y = element_text(face = "bold", size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"), # Rotate x-axis labels
axis.text.y = element_text(face = "bold")
)
p_matrix
To evaluate the model, this confusion matrix displays how many samples
were correctly predicted in each Genetic Origin, giving an idea of which
classes performed better or worse. The best performing class was
Non-Stiff-Stalk with only 2 incorrect predictions out of 24.
# Calculate class errors
class_errors <- confusion_table %>%
group_by(Reference) %>%
summarize(Total = sum(Freq),
Errors = sum(Freq[Prediction != Reference]),
ErrorRate = Errors / Total)
# Create an interactive bar chart with gradient color scale
p_error <- ggplot(class_errors,
aes(x = Reference, y = ErrorRate,
fill = ErrorRate, # Map fill to ErrorRate
tooltip = paste0("Class: ", Reference, "<br>",
"Error Rate: ", scales::percent(ErrorRate, accuracy = 0.1)),
data_id = Reference)) +
geom_bar_interactive(stat = "identity", color = "black") +
scale_fill_gradient(low = "white", high = "red") + # Gradient from white to red
labs(x = "Class", y = "Error Rate", title = "Class Error Rates") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + # Display y-axis as percentage
theme(
plot.title = element_text(face = "bold", hjust = 0.5, size = 18), # Center title
axis.title.x = element_text(face = "bold", size = 16), # Bold axis titles
axis.title.y = element_text(face = "bold", size = 16),
axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"), # Rotate x-axis labels
axis.text.y = element_text(face = "bold")
)
# Render the interactive plot with ggiraph
interactive_plot = girafe(ggobj = p_error,
width_svg = 10,
height_svg = 7)
interactive_plot = girafe_options(
interactive_plot,
opts_hover(css = "fill:Blue;stroke:black;")
)
interactive_plot
This graph displays the class errors for each Genetic Origin. As seen with the confusion matrix, Mixed has the highest error while Non-Stiff-Stalk has the least. This could be due to the fact that Mixed does not have as many samples as the other classes along with being a combination of all the other Genetic Origins. As a result, the phenotypic measurements vary widely and can get mixed up with other classes.
girafe(
ggobj = p / p_error,
width_svg = 10, height_svg = 10
) %>%
girafe_options(
opts_hover(css = "fill:black;")
)
This last plot allows you to see the relationship between class error and its variance / number of data points. Hover over a bar to see the corresponding cluster.