Differential Expression of ERBB2 Gene in Pikachu Data


Tanishk Sinha
I am Tanishk. Nice to meet you.

Differential Expression of ERBB2 Gene in Pikachu Data

Figure Description

I switched from the Eevee dataset to the Pikachu dataset. As a result of the elbow plot, I judged that the optimal k for k-means clustering of the TSNE data was better at k = 6 rather than k = 4 for the Eevee dataset. This may be because there are more transcriptionally distinct cell-clusters in the cell-based Pikachu dataset. That may be because with the spot-based Eevee dataset, several cells may blend together as the resolution of the spot is larger than a cell. However, the cell-annotated data has a higher individual cell resolution. Furthermore, in order to find the cell type that was most similar to the one I found in the Eevee dataset, I performed calculations for each Pikachu cluster compared to the Eevee cluster using a two-sided t test to find the Pikachu cluster that was least different from the Eevee cluster. I then inserted the code from HW4 without much issue for creating the graphs, besides decreasing the spot size for the TSNE and physical location figures.

data <- read.csv('~/OneDrive/Documents/School/Johns Hopkins/Spring 2024/eevee.csv.gz', row.names=1)
data[1:5,1:5]
pos <- data[,2:3]
gexp <- data[, 4:ncol(data)]

topgene <- names(sort(apply(gexp, 2, var), decreasing=TRUE)[1:1000]) 
gexp <- gexpfilter <- gexp[,topgene]

gexpnorm <- log10(gexp/rowSums(gexp) * mean(rowSums(gexp))+1)

pcs <- prcomp(gexpnorm)

# tsne
library(Rtsne)
emb <- Rtsne(pcs$x[,1:20])$Y # faster than tSNE on genes

library(purrr)

tot_withinss <- map_dbl(1:10,  function(k){
  model <- kmeans(x = emb, centers = k)
  model$tot.withinss
})
# Generate a data frame containing both k and tot_withinss
elbow_df <- data.frame(
  k = 1:10,
  tot_withinss = tot_withinss
)

library(ggplot2)
# Plot the elbow plot -> chose k = 4 https://www.r-bloggers.com/2020/05/how-to-determine-the-number-of-clusters-for-k-means-in-r/
elbow <- ggplot(elbow_df, aes(x = k, y = tot_withinss)) +
  geom_line() + geom_point()+
  scale_x_continuous(breaks = 1:10)

com <- as.factor(kmeans(gexpnorm, centers=4)$cluster)

#plot tsne with clusters
p1 <- ggplot(data.frame(emb, com)) + 
  geom_point(aes(x = X1, y = X2, col=com), size=1) + 
  theme_bw()

#plot location of clusters
position <- data.frame(pos, com)

physical <- ggplot(position) + 
  geom_point(aes(x = aligned_x, 
                 y=aligned_y, 
                 col= position$com == 4), 
             size=1) +
  theme_minimal()

#finding differentially expressed genes
gexpnorm$cluster <- com

results <- sapply(colnames(gexp), function(g) {
  t.test(gexpnorm[com == 4, g], 
         gexpnorm[com != 4, g], 
         alternative = "greater")$p.val
})
results

# find p values https://r-graph-gallery.com/218-basic-barplots-with-ggplot2.html
top10p <- sort(results, decreasing = FALSE)[1:10]

top10p 

top10genes <- rownames(data.frame(top10p))

genes <- data.frame(top10p, top10genes)

#https://guslipkin.medium.com/reordering-bar-and-column-charts-with-ggplot2-in-r-435fad1c643e
#plot p values of top 10 upregulated genes
top10_genes <- reorder(top10genes, top10p)

p_values <- ggplot(genes, aes(x= top10_genes, y = top10p)) + geom_bar(stat='identity')

c4genes <- data.frame(gexpnorm[com == 4,][, top10genes])

nonc4genes <- data.frame(gexpnorm[com != 4,][, top10genes])

#https://www.geeksforgeeks.org/how-to-find-mean-of-dataframe-column-in-r/
#cluster 4 gene expressions
avg <- sapply(1:10, function(g) {
  mean(c4genes[, g])
})

#non cluster 4 gene expressions
nonavg <- sapply(1:10, function(g) {
  mean(nonc4genes[, g])
})

# https://www.tutorialspoint.com/how-to-create-a-replicated-list-of-a-list-in-r#:~:text=The%20replication%20of%20list%20of,(x)%2C5).
# https://www.digitalocean.com/community/tutorials/r-melt-and-cast-function

df1 <- data.frame(Cluster_4 = avg, Cluster_1to3 = nonavg)

df1 <- as.data.frame(melt(df1))
df1_new <- data.frame(df1, rep(colnames(c4genes), 2))
colnames(df1_new) <- c("Cluster", "Expression", "Gene")

# plot to display upregulated genes
upreg_genes <- ggplot(df1_new, aes(x = Gene, y = Cluster, fill = Expression)) +
  geom_tile() + scale_fill_gradient(low="white", high="red")

#choose gene APOD
apod_tsne <- ggplot(data.frame(emb, com, gexpnorm['APOD'])) +
  geom_point(aes(x = X1, y = X2, col = APOD, shape = com == 4), size=2) +
  scale_color_gradient(low="white", high="red")

apod_physical <- ggplot(data.frame(position, gexpnorm['APOD'])) + 
  geom_point(aes(x = aligned_x, y=aligned_y, col= APOD, shape = position$com == 4), 
             size=2) + scale_color_gradient(low="white", high="red")

# https://patchwork.data-imaginist.com/articles/guides/annotation.html
library(patchwork)










# DATA FOR PIKACHU
data <- read.csv('~/OneDrive/Documents/School/Johns Hopkins/Spring 2024/pikachu.csv.gz', row.names=1)
#data parsing changed
data[1:5,1:5]
pos <- data[, 4:5]
gexp <- data[, 6:ncol(data)]
rownames(pos) <- rownames(gexp) <- data$cell_id
cell_area <- data$cell_area
names(cell_area) <- data$cell_id
head(cell_area)

gexpnorm_p <- log10(gexp/rowSums(gexp) * mean(rowSums(gexp))+1)

orig_columns <- colnames(gexpnorm_p)

pcs <- prcomp(gexpnorm_p)

# tsne
library(Rtsne)
emb <- Rtsne(pcs$x[,1:20])$Y # faster than tSNE on genes

library(purrr)

tot_withinss <- map_dbl(1:20,  function(k){
  model <- kmeans(x = emb, centers = k)
  model$tot.withinss
})
# increased number of k
# Generate a data frame containing both k and tot_withinss
elbow_df <- data.frame(
  k = 1:20,
  tot_withinss = tot_withinss
)

library(ggplot2)
# Plot the elbow plot -> changed and chose k = 20 https://www.r-bloggers.com/2020/05/how-to-determine-the-number-of-clusters-for-k-means-in-r/
elbow <- ggplot(elbow_df, aes(x = k, y = tot_withinss)) +
  geom_line() + geom_point()+
  scale_x_continuous(breaks = 1:20)

#chose k = 6

elbow

com <- as.factor(kmeans(gexpnorm_p, centers=6)$cluster)

#plot tsne with clusters
p1 <- ggplot(data.frame(emb, com)) + 
  geom_point(aes(x = X1, y = X2, col=com), size=1) + 
  theme_bw()

p1

#plot physical location of clusters
position <- data.frame(pos, com)

#I changed the size of points to represent cells
physical <- ggplot(position) + 
  geom_point(aes(x = aligned_x, 
                 y=aligned_y, 
                 col= position$com == 1), 
             size=.25) +
  theme_minimal()

physical

#finding differentially expressed genes
gexpnorm_p$cluster <- com

#find common genes
names <- intersect(colnames(gexpnorm_p), colnames(gexpnorm))

gexpnorm_p_names <- gexpnorm_p[,names]

gexpnorm_names <- gexpnorm[,names]

names <- names[1:(length(names)-1)]

# new p value loop
# evidently, cluster 1 of the pikachu data 

results1 <- sapply(names, function(g) {
  t.test(gexpnorm_p_names[gexpnorm_p_names$cluster == 1, g], 
         gexpnorm_names[gexpnorm_names$cluster == 4, g])$p.val
})

results2 <- sapply(names, function(g) {
  t.test(gexpnorm_p_names[gexpnorm_p_names$cluster == 2, g], 
         gexpnorm_names[gexpnorm_names$cluster == 4, g])$p.val
})

results3 <- sapply(names, function(g) {
  t.test(gexpnorm_p_names[gexpnorm_p_names$cluster == 3, g], 
         gexpnorm_names[gexpnorm_names$cluster == 4, g])$p.val
})

results4 <- sapply(names, function(g) {
  t.test(gexpnorm_p_names[gexpnorm_p_names$cluster == 4, g], 
         gexpnorm_names[gexpnorm_names$cluster == 4, g])$p.val
})

results5 <- sapply(names, function(g) {
  t.test(gexpnorm_p_names[gexpnorm_p_names$cluster == 5, g], 
         gexpnorm_names[gexpnorm_names$cluster == 4, g])$p.val
})

results6 <- sapply(names, function(g) {
  t.test(gexpnorm_p_names[gexpnorm_p_names$cluster == 6, g], 
         gexpnorm_names[gexpnorm_names$cluster == 4, g])$p.val
})

results <- data.frame(results1 = results1, results2 = results2, 
                      results3 = results3, results4 = results4,
                      results5 = results5, results6 = results6)

colMeans(results, na.rm = TRUE)

#RETURN TO CODE -> choose cluster with lowest p value 

clus <- 1

results_p <- sapply(orig_columns, function(g) {
  t.test(gexpnorm_p[com == clus, g], 
         gexpnorm_p[com != clus, g], 
         alternative = "greater")$p.val
})
results_p

library(reshape2)

# find p values https://r-graph-gallery.com/218-basic-barplots-with-ggplot2.html
top10p <- sort(results_p, decreasing = FALSE)[1:10]

top10genes <- rownames(data.frame(top10p))

genes <- data.frame(top10p, top10genes)

#https://guslipkin.medium.com/reordering-bar-and-column-charts-with-ggplot2-in-r-435fad1c643e
#plot p values of top 10 upregulated genes
top10_genes <- reorder(top10genes, top10p)

p_values <- ggplot(genes, aes(x= top10_genes, y = top10p)) + geom_bar(stat='identity')

c4genes <- data.frame(gexpnorm_p[com == clus,][, top10genes])

nonc4genes <- data.frame(gexpnorm_p[com != clus,][, top10genes])

#https://www.geeksforgeeks.org/how-to-find-mean-of-dataframe-column-in-r/
#cluster of interest gene expressions
avg <- sapply(1:10, function(g) {
  mean(c4genes[, g])
})

#non cluster of interest gene expressions
nonavg <- sapply(1:10, function(g) {
  mean(nonc4genes[, g])
})

nonavg

# https://www.tutorialspoint.com/how-to-create-a-replicated-list-of-a-list-in-r#:~:text=The%20replication%20of%20list%20of,(x)%2C5).
# https://www.digitalocean.com/community/tutorials/r-melt-and-cast-function

df1 <- data.frame(Cluster_4 = avg, Cluster_1to3 = nonavg)

df1 <- as.data.frame(melt(df1))

df1_new <- data.frame(df1, rep(colnames(c4genes), 2))
df1_new

colnames(df1_new) <- c("Cluster", "Expression", "Gene")

# plot to display upregulated genes
upreg_genes <- ggplot(df1_new, aes(x = Gene, y = Cluster, fill = Expression)) +
  geom_tile() + scale_fill_gradient(low="white", high="red")

#choose gene of interest
com

gene <- "ERBB2"

rnase_tsne <- ggplot(data.frame(emb, com, gexpnorm_p['ERBB2'])) +
  geom_point(aes(x = X1, y = X2, col = ERBB2, shape = com == clus), size=2) +
  scale_color_gradient(low="white", high="red")

rnase_tsne

rnase_physical <- ggplot(data.frame(position, gexpnorm_p['ERBB2'])) + 
  geom_point(aes(x = aligned_x, y=aligned_y, col= ERBB2, shape = position$com == clus), 
             size=.5) + scale_color_gradient(low="white", high="red")

rnase_physical

# https://patchwork.data-imaginist.com/articles/guides/annotation.html
library(patchwork)

# https://www.datanovia.com/en/blog/ggplot-axis-ticks-set-and-rotate-text-labels/#:~:text=geom_boxplot()%20p-,Change%20axis%20tick%20mark%20labels,to%20rotate%20the%20tick%20text.
elbow <- elbow + ggtitle("Elbow Plot for PCs")
p1 <- p1 + ggtitle("KMeans Clustering of TSNE")
physical <- physical + ggtitle("Location of Cluster 1 Barcodes")
p_values <- p_values + ggtitle("P-Values for Cluster 1 Upregulated Genes") + theme(axis.text.x = element_text(angle = 90))
rnase_tsne <- rnase_tsne + ggtitle("Expression of ERBB2 Gene in TSNE")
rnase_physical <- rnase_physical + ggtitle("Expression of ERBB2 Gene in Barcodes")


elbow + p1 + physical + p_values + rnase_tsne + rnase_physical + plot_annotation(tag_levels = '1')