library("DESeq2")
## Loading required package: GenomicRanges
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## 
## The following object is masked from 'package:stats':
## 
##     xtabs
## 
## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, append,
##     as.data.frame, as.vector, cbind, colnames, do.call,
##     duplicated, eval, evalq, get, intersect, is.unsorted, lapply,
##     mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, rank, rbind, rep.int, rownames, sapply, setdiff,
##     sort, table, tapply, union, unique, unlist
## 
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Rcpp
## Loading required package: RcppArmadillo
file<-"/project/umw_biocore/genes2.tsv"

#1. Read the data 
rsem<- read.table(file);
head(rsem)
##         V1                               V2         V3         V4
## 1     gene                       transcript exper_rep1 exper_rep2
## 2 AK212155                       uc029wnt.1       0.00       0.00
## 3      Sp2            uc007ldf.1,uc007ldg.1      52.00      47.00
## 4 AK051368                       uc009coj.1       4.39       1.11
## 5   Ubiad1                       uc008vup.1     121.00     125.00
## 6      Src uc008npa.1,uc008npb.1,uc012cia.1      21.00      35.00
##           V5           V6           V7           V8
## 1 exper_rep3 control_rep1 control_rep2 control_rep3
## 2       0.00         0.00         0.00         0.00
## 3      36.00        99.00        53.00        66.00
## 4       0.00         1.10         0.00         0.00
## 5      65.00       134.00        95.00       111.00
## 6      20.00        43.00        22.00        32.00
#2. Read the data with header
rsem<- read.table(file,sep="\t", header=TRUE)

#3. Read the data with row.names
rsem<- read.table(file,sep="\t", header=TRUE, row.names=1, stringsAsFactors = TRUE) 

#4. Create data structure for DESeq Analysis
data <- data.frame(rsem[, c("exper_rep1","exper_rep2","exper_rep3",
                            "control_rep1","control_rep2","control_rep3")])

#5. Make sure all the cells in the table are integer
cols = c(1:6);
data[,cols] = apply(data[,cols], 2, function(x) as.numeric(as.integer(x)))
#Make a histogram
hist(log(data$exper_rep1), breaks=100)

plot of chunk unnamed-chunk-1

#6. Make a scatter plot with the averages of the replicas

avgall<-cbind(rowSums(data[1:3])/3, rowSums(data[4:6])/3)
colnames(avgall)<-c("Treat", "Control")

#7. Make a simple scatter plot
plot(avgall)

plot of chunk unnamed-chunk-1

#8. Hmm!!! The values are ranging from 0 to 800k. So, let's use log2.
plot(log2(avgall))

plot of chunk unnamed-chunk-1

#9. Let's change the x and y titles
log2vals <- log2(avgall)
colnames(log2vals)<-c("log2(Treat)", "log2(Control)")
plot(log2vals)

plot of chunk unnamed-chunk-1

#10. Let's make this pretty using ggplot
library("ggplot2")
gdat<-data.frame(avgall)
ggplot() + 
  geom_point(data=gdat, aes_string(x="Treat", y="Control"), colour="black", alpha=6/10, size=3) +
  scale_x_log10() +scale_y_log10()

plot of chunk unnamed-chunk-1

######################################
## LETS START DESeq ANALYSIS
######################################
#
#11. Define conditions for each library
conds <- factor( c("Control","Control", "Control", "Treat", "Treat","Treat") )
colData = as.data.frame((colnames(data)));
colData<-cbind(colData, conds)
colnames(colData) = c("Control","group");
groups = factor(colData[,2]);

#12. Filter out the genes if the # of total reads in all libs below 10. 
sumd = apply(X=data,MARGIN=1,FUN=sum);
filtd = subset(data, sumd > 10);

#13. Create DESeq data set using prepared table. 
dds = DESeqDataSetFromMatrix(countData=as.matrix(filtd), colData=colData, design = ~ group);

#14. Run DESeq analysis
dds <- DESeq(dds);
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
#15. Put the results into variable. 
res <- results(dds);
#16. Look for the column descriptions in the results 
mcols(res, use.names=TRUE)
## DataFrame with 6 rows and 2 columns
##                        type                                    description
##                 <character>                                    <character>
## baseMean       intermediate                    the base mean over all rows
## log2FoldChange      results log2 fold change (MAP): group Treat vs Control
## lfcSE               results         standard error: group Treat vs Control
## stat                results         Wald statistic: group Treat vs Control
## pvalue              results      Wald test p-value: group Treat vs Control
## padj                results                           BH adjusted p-values
#17. select only significant ones
f1<-res[!is.na(res$padj) & !is.na(res$log2FoldChange), ]
res_selected<-f1[(f1$padj<0.01 & abs(f1$log2FoldChange)>1),]

#18. Add a legend for all data
Legend<-"All"
gdat1<-cbind(gdat, Legend)
gdat_selected<-gdat[rownames(res_selected),]

#19. Add a legend for only significant ones
Legend<-"Significant"
gdat_selected1<-cbind(gdat_selected, Legend)

#20. Merge selected and all data
gdat2<-rbind(gdat1, gdat_selected1)

#21. Make ggplot
ggplot() + 
  geom_point(data=gdat2, aes_string(x="Treat", y="Control", colour="Legend"), alpha=6/10, size=3) +
  scale_colour_manual(values=c("All"="darkgrey","Significant"="red"))+
  scale_x_log10() +scale_y_log10()

plot of chunk unnamed-chunk-1

gdatMA<-gdat2

gdatMA$Treat=gdatMA$Treat+1
gdatMA$Control=gdatMA$Control+1

g<-gdatMA
colnames(g)<-c("M", "A", "Legend")
g$M<-log2(gdatMA$Treat*gdatMA$Control/2)
g$A<-log2(gdatMA$Treat/gdatMA$Control)

g$M <- scale(g$M, center=TRUE, scale=TRUE);
g$A <- scale(g$A, center=TRUE, scale=TRUE);

ggplot() + 
  geom_point(data=g, aes_string(x="M", y="A",  colour="Legend"), alpha=6/10, size=3) +
  ylab("Log2 Fold Change (M)") +
  xlab("Log2 mean normalized counts (A)") +
  scale_colour_manual(values=c("All"="black","Significant"="red"))+
  geom_abline(slope=0, linetype=2)

plot of chunk unnamed-chunk-1

#22. If you want to save as pdf use the command below
ggsave("~/myplot.pdf")
## Saving 7 x 5 in image
######################################
## Visualizing a little more
######################################
library("RColorBrewer");
library("gplots");
## 
## Attaching package: 'gplots'
## 
## The following object is masked from 'package:IRanges':
## 
##     space
## 
## The following object is masked from 'package:stats':
## 
##     lowess
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100);

#23. For MA Plot
plotMA(dds,ylim=c(-2,2),main="DESeq2");