Introduction


This R markdown will take you through the set of figures that I created during my last internship along with the data manipulation and cleaning that I used to accomplish my analysis.

Goal: Analyze maize leaf characteristics and their ability to discriminate its Genetic Background using Principal Component Analysis and Random Forest Classification.

Note: Most of the plots are interactable so make sure you hover over them.

Maize Leaf Data Cleaning and Manipulation


# Data Cleaning
library(tidyverse)
library(dplyr)
library(stringr)

# Data Viz
library(ggplot2)
library(ggiraph)

#Presentation
library(kableExtra)
library(DT)
library(patchwork)

# Classification
library(caret) # Cross Validation
library(caTools)
library(randomForest) 

Read in and Inspect the Data

# Read in dataset
all_shoots = read.csv("all_years_leafs_final.csv")

# Remove missing values
all_shoots = all_shoots[complete.cases(all_shoots), ]
datatable(
  head(all_shoots), 
  options = list(
    pageLength = 10,  # Number of rows to display
    searching = FALSE,  # Disable searching
    scrollX = TRUE,  # Enable horizontal scrolling
    autoWidth = TRUE  # Automatically adjust column width
  ),
  escape = FALSE
)

Data Manipulation and Cleaning

all_shoots2 = all_shoots %>%
  mutate(Year = as.factor(X)) %>% # Make year column into a factor
  select(-X) %>%
  # Convert Columns into Measurement Columns
  mutate(
    Blade_Width = as.numeric(Blade_Width),
    Blade_Length = as.numeric(Blade_Length),
    Surface_Area = as.numeric(Surface_Area),
    Sheath_Length = as.numeric(Sheath_Length),
    Genetic_Origin = str_replace(Genetic_Origin, "Stiff Stalk", "Stiff_Stalk")  
  )
leaf_counts_df = all_shoots2 %>%
  group_by(Leaf_No) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))

# Display the table using knitr::kable for a clean look
leaf_counts_df %>%
  kable(col.names = c("Leaf Number", "Number of Samples"),
        caption = "Number of Rows for Each Leaf No") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = F)
Number of Rows for Each Leaf No
Leaf Number Number of Samples
4 604
3 596
2 544
1 146
5 118

The Maize Leaf measurements were taken on leaves 1 through 5. However, as seen with the table above, the number of samples are imbalanced. For the purposes of this analysis, we will only look at leaves 3 and 4 since they have the most samples.

# Filter out leaves 1,2, 5 and widen dataset into leaf_no measurement pairs
all_shoots_wide = all_shoots2 %>%
  filter(Leaf_No == 4 | Leaf_No == 3) %>% # subset out leaves 4 and 3
  pivot_wider(
    names_from = Leaf_No, # Append Leaf_4 and Leaf_5 to the new columns
    values_from = c(Blade_Length, Blade_Width, Sheath_Length, Surface_Area), 
    names_glue = "{.value}_Leaf{Leaf_No}", 
    # Group by the following columns to account for duplicates
    id_cols = c(Year, Season, Genotype_ID, Treatment_ID, Genetic_Origin, Plant_ID),
    # for duplicate values, sum the the rows together
    values_fn = list(Blade_Length = sum, Blade_Width = sum, Sheath_Length = sum, Surface_Area = sum) 
    ) %>%
  select(-Plant_ID) %>% # Get rid of PLant_ID column
  drop_na() # Remove rows that only have Leaf 3 data or only have Leaf 4 data

Using leaves three and four, I pivot the data frame wider so that there is a column for each leaf and measurement pair. An example is show below:

Pivot Wider Example

Maize Leaf Data Analysis


Principle Component Analysis

High dimensional data can be difficult to interpret and visualize. Principle Component Analyis allows us to reduce the dimensionality of the data so that we can look for patterns while sill retaining most of the information.

numeric_data = all_shoots_wide[, c(7:ncol(all_shoots_wide))]

scaled_data = scale(numeric_data) #make the entire dataset on the same scale
fit = prcomp(scaled_data, center=T, scale=T) #conduct the PCA
# Plot first two principle components
comp_leafs = data.frame(fit$x[,1:2])
comp_leafs$Type = all_shoots_wide$Genetic_Origin

# Plot first two principle components and color by Genetic Origin
p = ggplot(comp_leafs, aes(x = PC1, y = PC2, 
                           color = Type, data_id = Type, 
                           tooltip = paste0("Class: ", Type))) +
  geom_point_interactive(shape = 19, size = 2, alpha = 0.6) +
  stat_ellipse(size = 1) +    # Customize ellipse
  labs(x = "Principal Component 1", 
       y = "Principal Component 2", 
       title = "PCA Plot of Leaf 3 & 4 Data", 
       subtitle = "Colored by Genetic Origin") +  # Add subtitle
  theme_minimal(base_size = 15) +                  # Set base font size
  theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 18),   # Center title
        plot.subtitle = element_text(hjust = 0.5, size = 14),  # Center subtitle
        legend.position = "right",                 # Adjust legend position
        legend.title = element_text(face = "bold"), 
        legend.background = element_rect(fill = "gray95", size = 0.5, linetype = "solid"),
        panel.grid.major = element_line(color = "gray80", size = 0.5),  # Subtle grid lines
        panel.grid.minor = element_blank(),        # Remove minor grid lines
        )

interactive_plot <- girafe(
  ggobj = p,
  width_svg = 10,
  height_svg = 7,
  options = list(
    opts_hover(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;"),  # Highlight hovered points
    opts_hover_key(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;")  # Highlight entire group on hover
  )
)

interactive_plot

From this plot, we can see the clusters of Genetic Origin have a lot of overlap and there aren’t any clear patterns. Lets color by another variable that may be explaining the variation.

# Plot first two principle components
comp_leafs = data.frame(fit$x[,1:2])
comp_leafs$Year = all_shoots_wide$Year

# Plot first two principle components and color by Genetic Origin
p = ggplot(comp_leafs, aes(x = PC1, y = PC2, 
                           color = Year, data_id = Year, 
                           tooltip = paste0("Year: ", Year))) +
  geom_point_interactive(shape = 19, size = 2, alpha = 0.6) +
  stat_ellipse(size = 1) +    # Customize ellipse
  labs(x = "Principal Component 1", 
       y = "Principal Component 2", 
       title = "PCA Plot of Leaf 3 & 4 Data", 
       subtitle = "Colored by Year") +  # Add subtitle
  theme_minimal(base_size = 15) +                  # Set base font size
  theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 18),   # Center title
        plot.subtitle = element_text(hjust = 0.5, size = 14),  # Center subtitle
        legend.position = "right",                 # Adjust legend position
        legend.title = element_text(face = "bold"), 
        legend.background = element_rect(fill = "gray95", size = 0.5, linetype = "solid"),
        panel.grid.major = element_line(color = "gray80", size = 0.5),  # Subtle grid lines
        panel.grid.minor = element_blank(),        # Remove minor grid lines
        )

interactive_plot <- girafe(
  ggobj = p,
  width_svg = 10,
  height_svg = 7,
  options = list(
    opts_hover(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;"),  # Highlight hovered points
    opts_hover_key(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;")  # Highlight entire group on hover
  )
)

interactive_plot

When coloring by Year, the clusters are much more distinct. The separation of the data seems to be driven by the environmental conditions that differ across years. Lets normalize the data by the Control to account for this and then re-run the analysis

Normalize Data By Control Genotype B73

# Normalize the shoot data by subtracting by the mean of Control Genotype B73 in each Year, Season, Treatment Group
normalized_shoots = all_shoots_wide %>%
  group_by(Year, Treatment_ID, Season) %>%
    # Mutate the columns in the widened dataset
  # First calculate the mean of each Year, Season, B73 group
  # Then, subtract each row in a group by the mean for that group
  mutate(across(where(is.numeric),
                ~ (. - mean(. [Genotype_ID == "B73"], na.rm = TRUE)))) %>%
  ungroup() %>%
  drop_na()
# Re-run the analysis with normalized data

numeric_data_normalized = normalized_shoots[, c(7:ncol(normalized_shoots))]

scaled_data = scale(numeric_data_normalized) #make the entire dataset on the same scale
fit = prcomp(scaled_data, center=T, scale=T) #conduct the PCA
# Plot first two principle components
comp_leafs = data.frame(fit$x[,1:2])
comp_leafs$Type = all_shoots_wide$Genetic_Origin

# Plot first two principle components and color by Genetic Origin
p = ggplot(comp_leafs, aes(x = PC1, y = PC2, 
                           color = Type, data_id = Type, 
                           tooltip = paste0("Class: ", Type))) +
  geom_point_interactive(shape = 19, size = 2, alpha = 0.6) +
  stat_ellipse(size = 1) +    # Customize ellipse
  labs(x = "Principal Component 1", 
       y = "Principal Component 2", 
       title = "PCA Plot of Leaf 3 & 4 Data (Normalized)", 
       subtitle = "Colored by Genetic Origin") +  # Add subtitle
  theme_minimal(base_size = 15) +                  # Set base font size
  theme(plot.title = element_text(face = "bold", hjust = 0.5, size = 18),   # Center title
        plot.subtitle = element_text(hjust = 0.5, size = 14),  # Center subtitle
        legend.position = "right",                 # Adjust legend position
        legend.title = element_text(face = "bold"), 
        legend.background = element_rect(fill = "gray95", size = 0.5, linetype = "solid"),
        panel.grid.major = element_line(color = "gray80", size = 0.5),  # Subtle grid lines
        panel.grid.minor = element_blank(),        # Remove minor grid lines
        )

interactive_plot <- girafe(
  ggobj = p,
  width_svg = 10,
  height_svg = 7,
  options = list(
    opts_hover(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;"),  # Highlight hovered points
    opts_hover_key(css = "stroke:blue; stroke-width:2px; fill-opacity:0.7;")  # Highlight entire group on hover
  )
)

interactive_plot

After normalizing the data to reduce the effect of distribution shift seen across years, we re-ran the analysis and see much more distinct clusters. In this plot, we can see that Stiff Stalk has the least variation and Popcorn is also very distinct.

Random Forest To Predict Genetic Origin

Now that we have seen how the high dimensional leaf data is clustered, lets see if our visual observations match up with machine learning model results.

A random Forest model with 500 trees and 30 k-Fold cross validation will be used. An 80% Train and 20% Test split will be used to evaluate the ability of the model to generalize to new data.

# Create dataframe with numeric measurements and mutate on desired class column

classification_shoots = numeric_data_normalized %>%
  mutate(Class = as.factor(make.names(normalized_shoots$Genetic_Origin)))
data = classification_shoots
# Splitting data into train and test set
set.seed(120)  # Setting seed 

split = sample.split(data$Class, SplitRatio = 0.80)
train = subset(data, split == TRUE)
test = subset(data, split == FALSE)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, : There were missing
## values in resampled performance measures.
# Calculate overall accuracy
correct_predictions = sum(diag(confusion_mtx))
total_instances = sum(confusion_mtx)
accuracy = correct_predictions / total_instances

# Print accuracy
print(accuracy)
## [1] 0.8220339
# Confusion Matrix 
confusion_mtx = confusionMatrix(as.factor(y_pred), as.factor(test$Class))

# # Extract the confusion matrix table
confusion_table = as.data.frame(confusion_mtx$table)

confusion_table$Prediction = factor(confusion_table$Prediction, levels=rev(levels(confusion_table$Prediction)))

p_matrix = ggplot(confusion_table, aes(Prediction, Reference, fill = Freq)) +
  geom_tile() + 
  geom_text(aes(label = Freq)) +
  scale_fill_gradient(low = "white", high = "#009194") +
  labs(x = "Reference", y = "Prediction", title = "Confusion Matrix of Predictions on Held Out Test Set") +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 18),  # Center title
    axis.title.x = element_text(face = "bold", size = 16),  # Bold axis titles
    axis.title.y = element_text(face = "bold", size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),  # Rotate x-axis labels
    axis.text.y = element_text(face = "bold")
  )
  
p_matrix

To evaluate the model, this confusion matrix displays how many samples were correctly predicted in each Genetic Origin, giving an idea of which classes performed better or worse. The best performing class was Non-Stiff-Stalk with only 2 incorrect predictions out of 24.

# Calculate class errors
class_errors <- confusion_table %>%
  group_by(Reference) %>%
  summarize(Total = sum(Freq),
            Errors = sum(Freq[Prediction != Reference]),
            ErrorRate = Errors / Total)

# Create an interactive bar chart with gradient color scale
p_error <- ggplot(class_errors, 
            aes(x = Reference, y = ErrorRate, 
                fill = ErrorRate,  # Map fill to ErrorRate
                tooltip = paste0("Class: ", Reference, "<br>",
                                 "Error Rate: ", scales::percent(ErrorRate, accuracy = 0.1)), 
                data_id = Reference)) +
  geom_bar_interactive(stat = "identity", color = "black") +
  scale_fill_gradient(low = "white", high = "red") +  # Gradient from white to red
  labs(x = "Class", y = "Error Rate", title = "Class Error Rates") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +  # Display y-axis as percentage
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 18),  # Center title
    axis.title.x = element_text(face = "bold", size = 16),  # Bold axis titles
    axis.title.y = element_text(face = "bold", size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1, face = "bold"),  # Rotate x-axis labels
    axis.text.y = element_text(face = "bold")
  )

# Render the interactive plot with ggiraph
interactive_plot = girafe(ggobj = p_error, 
                           width_svg = 10,
                           height_svg = 7)

interactive_plot = girafe_options(
  interactive_plot,
  opts_hover(css = "fill:Blue;stroke:black;")
)

interactive_plot

This graph displays the class errors for each Genetic Origin. As seen with the confusion matrix, Mixed has the highest error while Non-Stiff-Stalk has the least. This could be due to the fact that Mixed does not have as many samples as the other classes along with being a combination of all the other Genetic Origins. As a result, the phenotypic measurements vary widely and can get mixed up with other classes.

girafe(
  ggobj = p / p_error,
  width_svg = 10, height_svg = 10
) %>% 
  girafe_options(
    opts_hover(css = "fill:black;")
  )

This last plot allows you to see the relationship between class error and its variance / number of data points. Hover over a bar to see the corresponding cluster.