R
The R and related Bioconductor packages can be invaluable to those of researchers in the life sciences. It is often reasonably well documented, capable of high-end statistical processes and can generate extremely complex and high end graphics. And the best part is that it is FREE.... The down side is there is a bit of a learning curve and there seems to be general assumption that you know a bit of standard command line stuff. The good news is there are a couple of good GUI interfaces that make things a bit easier, like RSTUDIO.
Standard R Prompt:
>
General Commands:
R - Starts an R session in a command terminal
q() - Quits an R session in a command terminal
getwd() - returns current working directory
setwd() - sets the currrent directory as the working directory
list.files() - shows all the files in the current directory (akin to ls in unix)
ls() OR objects() - returns a list of all objects in current session
WHAT ARE OBJECTS?
Good question. These are the things R creates and/or manipulates. These may be variables, arrays of numbers (vectors), character strings, functions, or more general structures built from such components.
rm() - allows you to delete an object(s) in the current session [ rm(foo) OR rm(foo, bar) ]
Using R as a Calculator:
#Find all the available "basic" mathmatical functions
> getGroupMembers(Arith)
[1] "+" "-" "*" "^" "%%" "%/%" "/"
#Open the related help page
> help(Arithmetic)
#Addition
> 2+2
[1] 4
#Subtraction
> 5-3
[1] 2
#Multiplication
> 4*3
[1] 12
#Division
> 20/4
[1] 5
>
Adding Values to Objects:
Option 1 - Manual Data Entry
## Assign the number 10 to an Object called A
# Classic method uses "less than" and dash
> A <- c(10)
# But these alternatives also work
> c(10) -> A
> assign("A", c(10))
## Assign the numbers 3, 4, 5, 6, 7 to an Object called B
> B <- c(3, 4, 5, 6, 7)
Seeing what Objects Exist:
> ls()
[1] "A" "B"
> objects()
[1] "A" "B"
Seeing the Contents of Objects:
> cat(A)
10
> cat(B)
3 4 5 6 7
> head(A)
[1] 10
> head(B)
[1] 3 4 5 6 7
Tutorial
Gene Size
KRAS 1000
NRAS 1200
TRAF3 1200
Read a tab separated table into R
> Table<-read.table("Example_Table.txt")
or
> Table<-read.table("Example_Table.txt",header=T)
Write a table out (this forces tab separation and ensures the row numbers used in R are not in the export
> write.table(sig, file="SignificantGenes2.txt", sep="\t", row.names=F)
Specific column for a table with a header
head(Table)
Genes<-Table$Genes
Math Operations
For any operation on one column of data, use Table$Columnname and insert the header for the column after the dollar sign.
In this example, "Size" is the name of the column we want to manipulate.
Mean
> mean(Table$Size)
Standard Deviation
> sd(Table$Size)
Median
> median(Table$Size)
Geomean
library(psych)
geometric.mean(Table$Size,na.rm=TRUE)
Mode
> freq <- tapply(Table$Size, Table$Size, length)
> as.numeric(names(freq)[which.max(freq)])
Quartile
> quartile(Table$Size)
Quantile (every 10%)
> quantile(Table$Size,probs=c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1))
Subset
- Used to extract rows of a table. Similar to grep or awk.
Usage: subset(Data_Table, Column/Operator/Value)
Operators:
XY Plots
#Download the test data from the data repository section of the site (HtSeq_StrandedVersusUnstranded.txt)
#Open R Studio
#Set working directory to location of downloaded files (>Session>Set Working Directory>Choose Directory)
#Load ggplot
>library(ggplot2)
>library(scales)
#Load data table into R
>xyTest <- read.table("HtSeq_StrandedVersusUnstranded.txt", header=T)
#View table structure
>head(xyTest)
#Basic XY Plot
>ggplot(xyTest, aes(Count_Unstranded, Count_Stranded)) + geom_point()
#Advanced XY Plot - (add 1 to all values to allow log scale plot, scale graph to log10 and include exponential annotation, add custom labels and title, set text size)
>ggplot(xyTest, aes(Count_Unstranded+1, Count_Stranded+1)) + geom_point() + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x))) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x))) + ylab(label = "Stranded Count") + xlab(label = "Unstranded Count") + ggtitle("HtSeq - Stranded versus Unstranded") + theme(axis.text = element_text(size=16), axis.title = element_text(size=20), title = element_text(size=18))
#Custom with continuous color gradient of 7 rainbow colors, custom annotations both text and rectangles
>ggplot(sex7, aes(Mean, SD)) + geom_point(aes(color = Count)) + annotate("rect", xmin = 0.60, xmax = 0.73, ymin = 0.23, ymax = 0.29, alpha = .2) + annotate("rect", xmin=0.95, xmax=1.0, ymin=0.0, ymax=0.16, alpha=0.2) + annotate("text", x=0.67, y=0.19, label="Female\nX=0.60-0.73\nY=0.23-0.29") + annotate("text", x=0.9, y=0.085, label="Male\nX=0.95-1.00\nY=0.00-0.16") + scale_x_continuous(limits=c(0.5,1.001)) + scale_y_continuous(limits=c(0.0,0.501)) + theme(axis.text = element_text(size=12), axis.title = element_text(size=16)) + annotate("text", x=0.76, y=0.346, label="MMRF_1159", size=4) + annotate("text", x=0.905, y=0.174, label="MMRF_1687", size=4) + annotate("text", x=0.855, y=0.232, label="MMRF_1400", size=4) + annotate("text", x=0.895, y=0.196, label="MMRF_1089", size=4) + scale_color_gradientn("Informative\nVariants", colours=rainbow(7), limits=c(0.0,2000), na.value = "purple", guide = "colourbar")
#Get the correlation values; pearson [absolute value correlation] and spearman(rho) [rank-order correlation]
>cor(xyTest[c(2, 3)], method="spearman")
>cor(xyTest[c(2, 3)], method="pearson")
Working with a Large Matrix
#Load necessary libraries
library(ggplot2)
library(reshape2)
library(grid)
library(gridExtra) #For creating multiple plot pages
library(psych) #For geometric mean
#Read the table, since the matrix is numbers outside of the first row and column
#we indicate that there is a header and that the row name should be derived from the first column
TPM <- read.table("TPM_Matrix.txt", header=T, sep="\t", row.names=1, stringsAsFactors=F)
## REPLACE GeneX with your gene name
#Extract a gene of interest from the matrix for graphing
GeneX <- subset(TPM, subset=rownames(TPM)=="ENSGxxxxxxxxxxxx")
head(GeneX)
#Transpose the single row gene table into a two column table
mGeneX <- melt(GeneX)
head(mGeneX)
#Generate the various plots and then produce a single image with all 4 plots
a <- ggplot(mGeneX
, aes(x="GeneX", y=value)) + geom_boxplot()
b <- ggplot(mGeneX
, aes(x=value)) + geom_histogram(binwidth=5)
c <- ggplot(mGeneX
, aes(x="GeneX", y=value)) + geom_point(position = "jitter")
d <- ggplot(mGeneX
, aes(x="GeneX
", y=value)) + geom_dotplot(binaxis = "y", stackdir = "center")
grid.arrange(a, b, c, d, ncol=2, nrow=2)
## DYNAMIC code version, just update the first 2 lines to your gene name and ENSG_ID
HUGO <- "TRAF3"
ENSG <- "ENSG00000131323"
GeneX <- subset(TPM, subset=rownames(TPM)==ENSG)
mGeneX <- melt(GeneX)
a <-
ggplot(mGeneX, aes(x=HUGO, y=value)) + geom_boxplot() + xlab(label = "")
b <- ggplot(mGeneX
, aes(x=value)) + geom_histogram(binwidth=5)
c <- ggplot(mGeneX, aes(x=HUGO, y=value)) + geom_point(position = "jitter") + xlab(label = "")
d <- ggplot(mGeneX, aes(x=HUGO, y=value)) + geom_dotplot(binaxis = "y", stackdir = "center") + xlab(label = "")
grid.arrange(a, b, c, d, ncol=2, nrow=2)
#Get a list of the samples with TPM values above 100
subset(mGeneX, value > 100)
#To make it faster to load the matrix a second time
save(TPM, "TPM.rData")
#In subsequent runs load the data object instead of reading the table to make things faster
load("TPM.rData")
#More, load different matrix, calculate geometric means and merge and graph
> NFKB_Keats <- read.table("IA7_Keats_NFKB.txt", header=T, sep="\t", row.names=1, stringsAsFactors=F)
> gmNFKB_Keats <- geometric.mean(NFKB_Keats+1,na.rm=TRUE)
> MgmNFKB_Keats <- melt(gmNFKB_Keats)
> ggplot(MgmNFKB_Keats, aes(x=value)) + geom_histogram(binwidth=0.25)
> ggplot(MgmNFKB_Keats, aes(x=value)) + geom_histogram(binwidth=5)
> ggplot(MgmNFKB_Keats, aes(x="NFKB Index (Keats)", y=value)) + geom_point(position = "jitter")
> NFKB_Ann <- read.table("IA7_Annunziate_NFKB.txt", header=T, sep="\t", row.names=1, stringsAsFactors=F)
> gmNFKB_Ann <- geometric.mean(NFKB_Ann+1,na.rm=TRUE)
> MgmNFKB_Ann <- melt(gmNFKB_Ann)
> ggplot(MgmNFKB_Ann, aes(x=value)) + geom_histogram(binwidth=5)
> ggplot(MgmNFKB_Ann, aes(x=value)) + geom_histogram(binwidth=1)
> ggplot(MgmNFKB_Ann, aes(x="NFKB Index (Annunziate)", y=value)) + geom_point(position = "jitter")
> merge_NFKB <- merge(MgmNFKB_Keats, MgmNFKB_Ann, by="row.names")
> cor(merge_NFKB[c(2, 3)], method="pearson")
> cor(merge_NFKB[c(2, 3)], method="spearman")
> ggplot(merge_NFKB, aes(value.x, value.y)) + geom_point()
#Create a vector of ENSGxxx identifiers as "list" and a vector of samples as "samples"
#This will extract the genes in "list" and the samples in "samples"
list <- as.vector(keats_nfkb_list$V1)
Test2 <- subset(nonB, subset=rownames(nonB)%in%list, select=samples)
Mutation Plots
> library(ggplot2)
> library(reshape2)
> library(gridExtra)
> sigMutIA6 <- read.table("IA6_Significant_Mutations.txt", header=T)
> mSigMutIA6 <- melt(sigMutIA6, id.vars="Gene")
> full <- read.table("IA6_Significant_Mutations.txt", header=T)
> a <- ggplot(full, aes(x=Gene, y=log10(p), fill=Significant)) + geom_bar(stat="identity") + coord_flip() + geom_hline(aes(yintercept=log10(0.05)), colour="black", linetype="dashed") + ylab(label = "p-value (log10)") + xlab("") + scale_fill_discrete(name="Significant q-value ") + xlab("") + theme(axis.text.y = element_blank(), axis.ticks = element_blank(), axis.text.x = element_text(size = 14,colour = "black"), axis.title = element_text(size = 20), legend.title = element_text(size = 15), legend.text = element_text(size = 12), legend.position="top") + scale_y_reverse()
> b <- ggplot(mSigMutIA6, aes(x=Gene, y=value, fill=variable)) + geom_bar(stat='identity') + theme(axis.text.y = element_text(size = 14, color="black"), axis.text.x = element_text(size = 14,colour = "black"), axis.title = element_text(size = 20), legend.title = element_text(size = 15), legend.text = element_text(size = 12), legend.position="top") + ylab(label = "Somatic Mutation Count") + scale_fill_discrete(name="Mutation Type ", labels=c("Non-Silent", "Silent", "Non-Coding")) + xlab("") + coord_flip()
> grid.arrange(b,a,ncol=2,widths=c(3, 1))
#Create a plot for each gene showing DNA and RNA allele fraction correlations
> dnarna <- read.table("IA5_graph2.txt", header=T)
>
head(dnarna)
RNA_Alt_Freq DNA_Alt_Freq Gene
1 0.115869 0.283 ACTG1
2 0.307286 0.378 ACTG1
> ggplot(dnarna, aes(DNA_Alt_Freq, RNA_Alt_Freq)) + geom_point() + facet_wrap(~ Gene) + ylab(label = "Alt Allele Ratio - RNA") + xlab(label = "Alt Allele Ratio - DNA")
Making Venn Diagrams
#Load Venn Diagram library, it can make 2-,3-,4-, and 5-way venn diagrams.
library(VennDiagram)
#Create clean workspace
grid.newpage()
#Create two-way Venn plot; Numbers represent Total Group1 count, Total Group2 count, Shared Group 1 & 2 count
venn.plot <- draw.pairwise.venn(315, 148, 118, c("Strelka_Pass", "Seurat_MMRF_Filtered"))
#Print the plot
grid.draw(venn.plot)
#For a better looking version
venn.plot <- draw.pairwise.venn(area1 = 267,
+ area2 = 808,
+ cross.area = 193,
+ category = c("Strelka Pass", "Seurat Raw"),
+ fill = c("blue", "red"),
+ alpha = 0.3,
+ lty = "blank",
+ cex = 2,
+ cat.cex = 2,
+ cat.pos = c(285, 105),
+ cat.dist = 0.09,
+ cat.just = list(c(-1, -1), c(1, 1)),
+ ext.pos = 30,
+ ext.dist = -0.05,
+ ext.length = 0.85,
+ ext.line.lwd = 2,
+ ext.line.lty = "dashed")
#More usable version
venn.plot <- draw.pairwise.venn(1734, 1969, 1248, c("MuTect_1000ng", "MuTect_500ng"), euler.d=T , scaled=T, fill=c("blue", "red"), alpha=0.3, lty="blank", cex=2, cat.cex=2, cat.dist=0.03, cat.pos=c(-30,30) )
#Create three-way Venn plot
#Numbers are Area1, Area2, Area3, Intersect 1&2, Intersect 2&3, Intersect 1&3, Intersect 1,2,3
venn.plot <- draw.triple.venn(100, 150, 180, 50, 100, 30, 15, c("WGS", "WES", "RNA"))
venn.plot <- draw.triple.venn(area1=2437, area2=662, area3=6557, n12=245, n23=594, n13=1293, n123=177, c("Bolli", "CoMMpass IA3", "MMGI - Lohr"), euler.d=T, scaled=F,fill=c("Red", "Green", "Blue"), alpha=0.5, cex=1.5, cat.cex=1.7, cat.dist=0.05)
USING GGPLOT2
Hi-Low Distribution plot with Delta coloring
ggplot(Tab, aes(Sample, Top1, ymin = Top2, ymax = Top1, fill = Mean_Top_Delta)) + geom_crossbar(stat='identity', linetype='blank', width=0.75) + xlab(label="Samples Tested") + ylab(label="Top Two Isoform Range") + theme(axis.text.x = element_text(angle=90,hjust = 0.9,vjust=0.5), axis.title = element_text(size=20, vjust=0.4)) + scale_y_continuous(limits=c(0,1), breaks=seq(0, 1, 0.1)) + scale_fill_gradient2("Mean\nInter\nIsoform\nDelta", low="blue", mid = "yellow", high="red", limits=c(0.0,1.0), midpoint = 0.5) + geom_hline(aes(yintercept=0.9), colour="blue", linetype="dashed")
Box Plot/Dot Plot
#Load the data table
All<-read.table("Full.txt",header=T)
#Box Plot
ggplot(All, aes(factor(Class), Ratio)) + geom_boxplot()
#Dot Plot
ggplot(All, aes(factor(Class), Ratio)) + geom_jitter(aes(colour = Alleles))
Box Plot/Dot Plot with x axis grouped into deciles
#Load needed library's
> library(ggplot2) #used to create the plot
> library(gdata) #used to import excel.xls or excel.xlsx files as a data.frame object
> library(Hmisc) #used to make a factor variable which groups the x axis data of a scatter plot into deciles
#read excel file, create factor variable, and plot data
> dat <- read.xls("filename.xlsx") #reads any excel file into R as a data.frame with headers
> RIN <- cut2(dat$AliquotStock_RIN, g=10) #cut2 (part of the Hmisc library) creates a factor. the g=10 specifies 10 groups of equal observations (deciles)
> ggplot(dat, aes(factor(RIN), dat$picardTools.rnaMetrics.PCT_MRNA_BASES)) + geom_jitter()
Histogram
Freq2<-read.table("Frequency_Table2.txt",header=T)
library(ggplot2)
p <- ggplot(Freq2, aes(x=Count))
p + geom_histogram(aes(fill = Assay),binwidth = 10)
ggplot(Freq2) + geom_histogram(aes(x=Ratio,fill = Assay),binwidth = .02)
#CODE FOR GAINED REGIONS
ggplot(CRD, aes(x=X1q21)) + geom_histogram(binwidth=0.01) + geom_vline(xintercept = -1.0, colour="green", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.5849, colour="orange", linetype = "longdash", size = 1) + geom_vline(xintercept = 1.0, colour="red", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.0, colour="darkgrey", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.1375, colour="purple", linetype = "dashed", size = 1) + geom_vline(xintercept = 0.3219, colour="purple", linetype = "solid", size = 1) + theme(plot.title = element_text(size=20), axis.text = element_text(size=16), axis.title = element_text(size=20), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) + labs(title = "1q21 - CKS1B") + ylab(label = "Count") + xlab(label = "Copy Number State [log2(T/N)]")
#CODE FOR DELETED REGIONS
ggplot(CRD, aes(x=X17p13)) + geom_histogram(binwidth=0.01) + geom_vline(xintercept = -1.0, colour="green", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.5849, colour="orange", linetype = "longdash", size = 1) + geom_vline(xintercept = 1.0, colour="red", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.0, colour="darkgrey", linetype = "longdash", size = 1) + geom_vline(xintercept = -0.1520, colour="purple", linetype = "dashed", size = 1) + geom_vline(xintercept = -0.4150, colour="purple", linetype = "solid", size = 1) + theme(plot.title = element_text(size=20), axis.text = element_text(size=16), axis.title = element_text(size=20), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) + labs(title = "17p13 - TP53") + ylab(label = "Count") + xlab(label = "Copy Number State [log2(T/N)]")
ggsave("1q21.png")
Line Diagram
#This plots two different data sources on the same graph.
#A smoothed trendline is applied and the confidence interval is suppressed (se=F)
>ggplot() + stat_smooth(data=cov3, aes(percent_gc, normalized_coverage, color=Sample), se=F, method = glm, formula= y ~ ns(x,15)) + stat_density(data=cov, aes(x=X.gc,y=..density../10))+ ylab(label = "Normalized Coverage") + xlab(label = "Percent GC of Capture Target") + theme(axis.text = element_text(size=16), axis.title = element_text(size=20), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) + ylim(limits=c(0,1))
More Examples
> Dist <- read.table("Intermarker_Distance.txt", header = T)
> library(ggplot2)
#These make things work
> library(MASS)
> library(scales)
> ggplot(Dist) + geom_histogram(aes(x=InterMarkerDistance), binwidth = 1000) + theme(axis.text = element_text(size = 16)) + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title = element_text(size=20)) + scale_x_continuous(name="Inter-Marker Distance", labels = comma, limits=c(0,500000), breaks=c(0,50000,100000,150000,200000,250000,300000,350000,400000,450000,500000))
> ggplot(Dist, aes(factor(CHR), InterMarkerDistance)) + geom_boxplot() + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x))) + annotation_logticks(sides = "l") + ylab(label = "Inter-Marker Distance (bp)") + xlab(label = "Chromosome") + theme(axis.text = element_text(size=16), axis.title = element_text(size=20))
# Bar Chart
ggplot(dat, aes(factor(dat$Hugo), dat$n_nonsilent, fill=dat$TPM_2)) +
geom_bar(stat = "identity") +
theme(axis.text.y = element_text(size = 14), axis.text.x = element_text(angle=90,size = 14,colour = "black", hjust = 0.9, vjust=0.5), axis.title = element_text(size = 24), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) +
scale_fill_gradient2("Log2\nMedian\n(TPM+1)", low="blue", mid="yellow", high="red", midpoint = 5) +
ylab(label = "Non-synonomous Mutation Count") +
xlab(label = "")
Create a Map
library(ggplot2)
library("PBSmapping")
library("data.table")
library(maptools)
library(maps)
mmrf_map <- read.table("map.txt", header=T)
> head(mmrf_map)
Vertical Horizontal
1 33.75286 -84.39013
2 34.11918 -84.30292
3 35.68612 -79.82919
vert <- as.vector(mmrf_map$Vertical)
hort <- as.vector(mmrf_map$Horizontal)
xlim=c(-150,25)
ylim=c(20,70)
worldmap = map_data("world")
setnames(worldmap, c("X","Y","PID","POS","region","subregion"))
worldmap = clipPolys(worldmap, xlim=xlim,ylim=ylim, keepExtra=TRUE)
statemap = map_data("state")
setnames(statemap, c("X","Y","PID","POS","region","subregion"))
statemap = clipPolys(statemap, xlim=xlim,ylim=ylim, keepExtra=TRUE)
test.y <- c(33.752856, 34.119177, 35.686122, 45.778852, 51.083333, 53.55)
test.x <- c(-84.39013, -84.30292, -79.82919, -108.5742, -114.083333, -113.5)
p <- ggplot() + coord_map(xlim=xlim,ylim=ylim) + geom_polygon(data=worldmap,aes(X,Y,group=PID), fill = "grey70",color="grey50") + geom_polygon(data=statemap,aes(X,Y,group=PID), fill = "grey70",color="grey50")
p + geom_point(aes(x=test.x, y=test.y), color="blue", size=3)
p + geom_point(aes(x=hort, y=vert), color="blue", size=3)
Install packages from the command line
sudo R
install.packages(c("PackageName"),dependencies=T)
q()
Graphing
Histogram
As tabular data
> hist(Table$Size,plot=F)
Trim value list to those that are less than 500
> Trimmed<-Table$Size[Table$Size<500]
Graph trimmed with bins from 0-500 with length of 10
> bins=seq(0,500,by=10)
> hist(Trimmed,breaks=bins)