# Code to load data, build and display heatmap of supervised cluster
# Author: Laurie Heyer, tweaked by A. Malcolm Campbell

# TODO change display to re-order rows in order of distance from GOI
# TODO make this a function with arguments as name of GOI and thresh 

# run this every time you open R for this kind of analysis
library(rnaseqWrapper)

# Load your data
#     the folder chosen in setwd needs to be customized for user and project
#     need subfolders geneResult and annotations, copied from server
setwd("/Users/macampbell/Desktop/Bio343")
countData <- mergeCountFiles("genes_Liver/")

# Pull out counts, make them integers, and view them
myCountData <- round( countData[,grep(".expected_count",names(countData))],0)
names(myCountData) <- gsub(".expected_count","",names(myCountData))

#filtering to keep only those genes whose mean expression is > 10, 
#play with this value to see how the size of toSearch changes.
myMeans <- apply(as.matrix(myCountData), 1, mean)
toSearch <- myCountData[myMeans > 10 & !is.na(myMeans),]

# Choose gene of interest by name from toSearch with filtered means >10,
# could compare with myCountData; goi = gene of interest
goi = which(rownames(toSearch)=="Contig12414_Npffr1_Neuropeptide_FF_receptor_1_Rattus_norvegicus")

# Compute distance from goi to every other gene using (1-corr)/2
# If you don’t get enough hits, you could go back and do this for myCountData instead. 
# goiData = myCountData[goi,]
goiData = toSearch[goi,]
corrgoi = (1 - cor(t(goiData), t(toSearch)))/2

# Get count data for genes within threshold distance of goi looking at toSearch only
# could compare with myCountData, but result may be too big and gum up R
# threshold currently set 0.1 but you can adjust to experiment with cluster size
thresh = 0.1
supClustCounts = toSearch[corrgoi < thresh ,] # ordered by gene number
supClustOrder = order(corrgoi)[1:nrow(supClustCounts)]
supClustCounts = toSearch[supClustOrder ,] # ordered by distance from goi

# Save supervised cluster to CSV file in working directory
write.csv(supClustCounts,file="Npffr1.csv")
# could print/save just the gene names stored in variable genes
# genes = rownames(supClustCounts) 

# Plot heatmap -- wanted rows in order of similarity to goi, but couldn't figure this out
heatsup = heatmap.mark(as.matrix(supClustCounts), scale="none", margins = c(15,10),
                       distfun=function(x) as.dist(( 1 - cor(t(x)) )/2))

# Image may be way to display data in order of similarity (unclustered)
#image(as.matrix(t(supClustCounts)),col=topo.colors(100))