R

The R and related Bioconductor packages can be invaluable to those of researchers in the life sciences. It is often reasonably well documented, capable of high-end statistical processes and can generate extremely complex and high end graphics. And the best part is that it is FREE.... The down side is there is a bit of a learning curve and there seems to be general assumption that you know a bit of standard command line stuff. The good news is there are a couple of good GUI interfaces that make things a bit easier, like RSTUDIO.

Standard R Prompt:

General Commands:

R - Starts an R session in a command terminal

q() - Quits an R session in a command terminal

getwd() - returns current working directory

setwd() - sets the currrent directory as the working directory

list.files() - shows all the files in the current directory (akin to ls in unix)

ls() OR objects() - returns a list of all objects in current session

WHAT ARE OBJECTS?

Good question. These are the things R creates and/or manipulates. These may be variables, arrays of numbers (vectors), character strings, functions, or more general structures built from such components.

rm() - allows you to delete an object(s) in the current session [ rm(foo) OR rm(foo, bar) ]

Using R as a Calculator:

#Find all the available "basic" mathmatical functions

> getGroupMembers(Arith)

[1] "+" "-" "*" "^" "%%" "%/%" "/"

#Open the related help page

> help(Arithmetic)

#Addition

> 2+2

[1] 4

#Subtraction

> 5-3

[1] 2

#Multiplication

> 4*3

[1] 12

#Division

> 20/4

[1] 5

Adding Values to Objects:

Option 1 - Manual Data Entry

## Assign the number 10 to an Object called A

# Classic method uses "less than" and dash

> A <- c(10)

# But these alternatives also work

> c(10) -> A

> assign("A", c(10))

## Assign the numbers 3, 4, 5, 6, 7 to an Object called B

> B <- c(3, 4, 5, 6, 7)

Seeing what Objects Exist:

> ls()

[1] "A" "B"

> objects()

[1] "A" "B"

Seeing the Contents of Objects:

> cat(A)

> cat(B)

3 4 5 6 7

> head(A)

[1] 10

> head(B)

[1] 3 4 5 6 7

Tutorial

Gene Size

KRAS 1000

NRAS 1200

TRAF3 1200

Read a tab separated table into R

> Table<-read.table("Example_Table.txt")

> Table<-read.table("Example_Table.txt",header=T)

Write a table out (this forces tab separation and ensures the row numbers used in R are not in the export

> write.table(sig, file="SignificantGenes2.txt", sep="\t", row.names=F)

Specific column for a table with a header

head(Table)

Genes<-Table$Genes

Math Operations

For any operation on one column of data, use Table$Columnname and insert the header for the column after the dollar sign.

In this example, "Size" is the name of the column we want to manipulate.

Mean

> mean(Table$Size)

Standard Deviation

> sd(Table$Size)

Median

> median(Table$Size)

Geomean

library(psych)

geometric.mean(Table$Size,na.rm=TRUE)

Mode

> freq <- tapply(Table$Size, Table$Size, length)

> as.numeric(names(freq)[which.max(freq)])

Quartile

> quartile(Table$Size)

Quantile (every 10%)

> quantile(Table$Size,probs=c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1))

R2 (R squared)

summary(lm(ColumnA ~ ColumnB, data=DataTable))$r.squared

Subset

- Used to extract rows of a table. Similar to grep or awk.

Usage: subset(Data_Table, Column/Operator/Value)

Operators:

XY Plots

#Download the test data from the data repository section of the site (HtSeq_StrandedVersusUnstranded.txt)

#Open R Studio

#Set working directory to location of downloaded files (>Session>Set Working Directory>Choose Directory)

#Load ggplot

>library(ggplot2)

>library(scales)

#Load data table into R

>xyTest <- read.table("HtSeq_StrandedVersusUnstranded.txt", header=T)

#View table structure

>head(xyTest)

#Basic XY Plot

>ggplot(xyTest, aes(Count_Unstranded, Count_Stranded)) + geom_point()

#Advanced XY Plot - (add 1 to all values to allow log scale plot, scale graph to log10 and include exponential annotation, add custom labels and title, set text size)

>ggplot(xyTest, aes(Count_Unstranded+1, Count_Stranded+1)) + geom_point() + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x))) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x))) + ylab(label = "Stranded Count") + xlab(label = "Unstranded Count") + ggtitle("HtSeq - Stranded versus Unstranded") + theme(axis.text = element_text(size=16), axis.title = element_text(size=20), title = element_text(size=18))

#Custom with continuous color gradient of 7 rainbow colors, custom annotations both text and rectangles

>ggplot(sex7, aes(Mean, SD)) + geom_point(aes(color = Count)) + annotate("rect", xmin = 0.60, xmax = 0.73, ymin = 0.23, ymax = 0.29, alpha = .2) + annotate("rect", xmin=0.95, xmax=1.0, ymin=0.0, ymax=0.16, alpha=0.2) + annotate("text", x=0.67, y=0.19, label="Female\nX=0.60-0.73\nY=0.23-0.29") + annotate("text", x=0.9, y=0.085, label="Male\nX=0.95-1.00\nY=0.00-0.16") + scale_x_continuous(limits=c(0.5,1.001)) + scale_y_continuous(limits=c(0.0,0.501)) + theme(axis.text = element_text(size=12), axis.title = element_text(size=16)) + annotate("text", x=0.76, y=0.346, label="MMRF_1159", size=4) + annotate("text", x=0.905, y=0.174, label="MMRF_1687", size=4) + annotate("text", x=0.855, y=0.232, label="MMRF_1400", size=4) + annotate("text", x=0.895, y=0.196, label="MMRF_1089", size=4) + scale_color_gradientn("Informative\nVariants", colours=rainbow(7), limits=c(0.0,2000), na.value = "purple", guide = "colourbar")

#Get the correlation values; pearson [absolute value correlation] and spearman(rho) [rank-order correlation]

>cor(xyTest[c(2, 3)], method="spearman")

>cor(xyTest[c(2, 3)], method="pearson")

Working with a Large Matrix

#Load necessary libraries

library(ggplot2)

library(reshape2)

library(grid)

library(gridExtra) #For creating multiple plot pages

library(psych) #For geometric mean

#Read the table, since the matrix is numbers outside of the first row and column

#we indicate that there is a header and that the row name should be derived from the first column

TPM <- read.table("TPM_Matrix.txt", header=T, sep="\t", row.names=1, stringsAsFactors=F)

## REPLACE GeneX with your gene name

#Extract a gene of interest from the matrix for graphing

GeneX <- subset(TPM, subset=rownames(TPM)=="ENSGxxxxxxxxxxxx")

head(GeneX)

#Transpose the single row gene table into a two column table

mGeneX <- melt(GeneX)

head(mGeneX)

#Generate the various plots and then produce a single image with all 4 plots

a <- ggplot(mGeneX, aes(x="GeneX", y=value)) + geom_boxplot()

b <- ggplot(mGeneX, aes(x=value)) + geom_histogram(binwidth=5)

c <- ggplot(mGeneX, aes(x="GeneX", y=value)) + geom_point(position = "jitter")

d <- ggplot(mGeneX, aes(x="GeneX", y=value)) + geom_dotplot(binaxis = "y", stackdir = "center")

grid.arrange(a, b, c, d, ncol=2, nrow=2)

## DYNAMIC code version, just update the first 2 lines to your gene name and ENSG_ID

HUGO <- "TRAF3"

ENSG <- "ENSG00000131323"

GeneX <- subset(TPM, subset=rownames(TPM)==ENSG)

mGeneX <- melt(GeneX)

a <- ggplot(mGeneX, aes(x=HUGO, y=value)) + geom_boxplot() + xlab(label = "")

b <- ggplot(mGeneX, aes(x=value)) + geom_histogram(binwidth=5)

c <- ggplot(mGeneX, aes(x=HUGO, y=value)) + geom_point(position = "jitter") + xlab(label = "")

d <- ggplot(mGeneX, aes(x=HUGO, y=value)) + geom_dotplot(binaxis = "y", stackdir = "center") + xlab(label = "")

grid.arrange(a, b, c, d, ncol=2, nrow=2)

#Get a list of the samples with TPM values above 100

subset(mGeneX, value > 100)

#To make it faster to load the matrix a second time

save(TPM, "TPM.rData")

#In subsequent runs load the data object instead of reading the table to make things faster

load("TPM.rData")

#More, load different matrix, calculate geometric means and merge and graph

> NFKB_Keats <- read.table("IA7_Keats_NFKB.txt", header=T, sep="\t", row.names=1, stringsAsFactors=F)

> gmNFKB_Keats <- geometric.mean(NFKB_Keats+1,na.rm=TRUE)

> MgmNFKB_Keats <- melt(gmNFKB_Keats)

> ggplot(MgmNFKB_Keats, aes(x=value)) + geom_histogram(binwidth=0.25)

> ggplot(MgmNFKB_Keats, aes(x=value)) + geom_histogram(binwidth=5)

> ggplot(MgmNFKB_Keats, aes(x="NFKB Index (Keats)", y=value)) + geom_point(position = "jitter")

> NFKB_Ann <- read.table("IA7_Annunziate_NFKB.txt", header=T, sep="\t", row.names=1, stringsAsFactors=F)

> gmNFKB_Ann <- geometric.mean(NFKB_Ann+1,na.rm=TRUE)

> MgmNFKB_Ann <- melt(gmNFKB_Ann)

> ggplot(MgmNFKB_Ann, aes(x=value)) + geom_histogram(binwidth=5)

> ggplot(MgmNFKB_Ann, aes(x=value)) + geom_histogram(binwidth=1)

> ggplot(MgmNFKB_Ann, aes(x="NFKB Index (Annunziate)", y=value)) + geom_point(position = "jitter")

> merge_NFKB <- merge(MgmNFKB_Keats, MgmNFKB_Ann, by="row.names")

> cor(merge_NFKB[c(2, 3)], method="pearson")

> cor(merge_NFKB[c(2, 3)], method="spearman")

> ggplot(merge_NFKB, aes(value.x, value.y)) + geom_point()

#Create a vector of ENSGxxx identifiers as "list" and a vector of samples as "samples"

#This will extract the genes in "list" and the samples in "samples"

list <- as.vector(keats_nfkb_list$V1)

Test2 <- subset(nonB, subset=rownames(nonB)%in%list, select=samples)

Mutation Plots

> library(ggplot2)

> library(reshape2)

> library(gridExtra)

> sigMutIA6 <- read.table("IA6_Significant_Mutations.txt", header=T)

> mSigMutIA6 <- melt(sigMutIA6, id.vars="Gene")

> full <- read.table("IA6_Significant_Mutations.txt", header=T)

> a <- ggplot(full, aes(x=Gene, y=log10(p), fill=Significant)) + geom_bar(stat="identity") + coord_flip() + geom_hline(aes(yintercept=log10(0.05)), colour="black", linetype="dashed") + ylab(label = "p-value (log10)") + xlab("") + scale_fill_discrete(name="Significant q-value ") + xlab("") + theme(axis.text.y = element_blank(), axis.ticks = element_blank(), axis.text.x = element_text(size = 14,colour = "black"), axis.title = element_text(size = 20), legend.title = element_text(size = 15), legend.text = element_text(size = 12), legend.position="top") + scale_y_reverse()

> b <- ggplot(mSigMutIA6, aes(x=Gene, y=value, fill=variable)) + geom_bar(stat='identity') + theme(axis.text.y = element_text(size = 14, color="black"), axis.text.x = element_text(size = 14,colour = "black"), axis.title = element_text(size = 20), legend.title = element_text(size = 15), legend.text = element_text(size = 12), legend.position="top") + ylab(label = "Somatic Mutation Count") + scale_fill_discrete(name="Mutation Type ", labels=c("Non-Silent", "Silent", "Non-Coding")) + xlab("") + coord_flip()

> grid.arrange(b,a,ncol=2,widths=c(3, 1))

#Create a plot for each gene showing DNA and RNA allele fraction correlations

> dnarna <- read.table("IA5_graph2.txt", header=T)

> head(dnarna)

RNA_Alt_Freq DNA_Alt_Freq Gene

1 0.115869 0.283 ACTG1

2 0.307286 0.378 ACTG1

> ggplot(dnarna, aes(DNA_Alt_Freq, RNA_Alt_Freq)) + geom_point() + facet_wrap(~ Gene) + ylab(label = "Alt Allele Ratio - RNA") + xlab(label = "Alt Allele Ratio - DNA")

Making Venn Diagrams

#Load Venn Diagram library, it can make 2-,3-,4-, and 5-way venn diagrams.

library(VennDiagram)

#Create clean workspace

grid.newpage()

#Create two-way Venn plot; Numbers represent Total Group1 count, Total Group2 count, Shared Group 1 & 2 count

venn.plot <- draw.pairwise.venn(315, 148, 118, c("Strelka_Pass", "Seurat_MMRF_Filtered"))

#Print the plot

grid.draw(venn.plot)

#For a better looking version

venn.plot <- draw.pairwise.venn(area1 = 267,

+ area2 = 808,

+ cross.area = 193,

+ category = c("Strelka Pass", "Seurat Raw"),

+ fill = c("blue", "red"),

+ alpha = 0.3,

+ lty = "blank",

+ cex = 2,

+ cat.cex = 2,

+ cat.pos = c(285, 105),

+ cat.dist = 0.09,

+ cat.just = list(c(-1, -1), c(1, 1)),

+ ext.pos = 30,

+ ext.dist = -0.05,

+ ext.length = 0.85,

+ ext.line.lwd = 2,

+ ext.line.lty = "dashed")

#More usable version

venn.plot <- draw.pairwise.venn(1734, 1969, 1248, c("MuTect_1000ng", "MuTect_500ng"), euler.d=T , scaled=T, fill=c("blue", "red"), alpha=0.3, lty="blank", cex=2, cat.cex=2, cat.dist=0.03, cat.pos=c(-30,30) )

#Create three-way Venn plot

#Numbers are Area1, Area2, Area3, Intersect 1&2, Intersect 2&3, Intersect 1&3, Intersect 1,2,3

venn.plot <- draw.triple.venn(100, 150, 180, 50, 100, 30, 15, c("WGS", "WES", "RNA"))

venn.plot <- draw.triple.venn(area1=2437, area2=662, area3=6557, n12=245, n23=594, n13=1293, n123=177, c("Bolli", "CoMMpass IA3", "MMGI - Lohr"), euler.d=T, scaled=F,fill=c("Red", "Green", "Blue"), alpha=0.5, cex=1.5, cat.cex=1.7, cat.dist=0.05)

USING GGPLOT2

Hi-Low Distribution plot with Delta coloring

ggplot(Tab, aes(Sample, Top1, ymin = Top2, ymax = Top1, fill = Mean_Top_Delta)) + geom_crossbar(stat='identity', linetype='blank', width=0.75) + xlab(label="Samples Tested") + ylab(label="Top Two Isoform Range") + theme(axis.text.x = element_text(angle=90,hjust = 0.9,vjust=0.5), axis.title = element_text(size=20, vjust=0.4)) + scale_y_continuous(limits=c(0,1), breaks=seq(0, 1, 0.1)) + scale_fill_gradient2("Mean\nInter\nIsoform\nDelta", low="blue", mid = "yellow", high="red", limits=c(0.0,1.0), midpoint = 0.5) + geom_hline(aes(yintercept=0.9), colour="blue", linetype="dashed")

Box Plot/Dot Plot

#Load the data table

All<-read.table("Full.txt",header=T)

#Box Plot

ggplot(All, aes(factor(Class), Ratio)) + geom_boxplot()

#Dot Plot

ggplot(All, aes(factor(Class), Ratio)) + geom_jitter(aes(colour = Alleles))

Box Plot/Dot Plot with x axis grouped into deciles

#Load needed library's

> library(ggplot2) #used to create the plot

> library(gdata) #used to import excel.xls or excel.xlsx files as a data.frame object

> library(Hmisc) #used to make a factor variable which groups the x axis data of a scatter plot into deciles

#read excel file, create factor variable, and plot data

> dat <- read.xls("filename.xlsx") #reads any excel file into R as a data.frame with headers

> RIN <- cut2(dat$AliquotStock_RIN, g=10) #cut2 (part of the Hmisc library) creates a factor. the g=10 specifies 10 groups of equal observations (deciles)

> ggplot(dat, aes(factor(RIN), dat$picardTools.rnaMetrics.PCT_MRNA_BASES)) + geom_jitter()

Histogram

Freq2<-read.table("Frequency_Table2.txt",header=T)

library(ggplot2)

p <- ggplot(Freq2, aes(x=Count))

p + geom_histogram(aes(fill = Assay),binwidth = 10)

ggplot(Freq2) + geom_histogram(aes(x=Ratio,fill = Assay),binwidth = .02)

#CODE FOR GAINED REGIONS

ggplot(CRD, aes(x=X1q21)) + geom_histogram(binwidth=0.01) + geom_vline(xintercept = -1.0, colour="green", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.5849, colour="orange", linetype = "longdash", size = 1) + geom_vline(xintercept = 1.0, colour="red", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.0, colour="darkgrey", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.1375, colour="purple", linetype = "dashed", size = 1) + geom_vline(xintercept = 0.3219, colour="purple", linetype = "solid", size = 1) + theme(plot.title = element_text(size=20), axis.text = element_text(size=16), axis.title = element_text(size=20), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) + labs(title = "1q21 - CKS1B") + ylab(label = "Count") + xlab(label = "Copy Number State [log2(T/N)]")

#CODE FOR DELETED REGIONS

ggplot(CRD, aes(x=X17p13)) + geom_histogram(binwidth=0.01) + geom_vline(xintercept = -1.0, colour="green", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.5849, colour="orange", linetype = "longdash", size = 1) + geom_vline(xintercept = 1.0, colour="red", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.0, colour="darkgrey", linetype = "longdash", size = 1) + geom_vline(xintercept = -0.1520, colour="purple", linetype = "dashed", size = 1) + geom_vline(xintercept = -0.4150, colour="purple", linetype = "solid", size = 1) + theme(plot.title = element_text(size=20), axis.text = element_text(size=16), axis.title = element_text(size=20), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) + labs(title = "17p13 - TP53") + ylab(label = "Count") + xlab(label = "Copy Number State [log2(T/N)]")

ggsave("1q21.png")

Line Diagram

#This plots two different data sources on the same graph.

#A smoothed trendline is applied and the confidence interval is suppressed (se=F)

>ggplot() + stat_smooth(data=cov3, aes(percent_gc, normalized_coverage, color=Sample), se=F, method = glm, formula= y ~ ns(x,15)) + stat_density(data=cov, aes(x=X.gc,y=..density../10))+ ylab(label = "Normalized Coverage") + xlab(label = "Percent GC of Capture Target") + theme(axis.text = element_text(size=16), axis.title = element_text(size=20), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) + ylim(limits=c(0,1))

More Examples

> Dist <- read.table("Intermarker_Distance.txt", header = T)

> library(ggplot2)

#These make things work

> library(MASS)

> library(scales)

> ggplot(Dist) + geom_histogram(aes(x=InterMarkerDistance), binwidth = 1000) + theme(axis.text = element_text(size = 16)) + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title = element_text(size=20)) + scale_x_continuous(name="Inter-Marker Distance", labels = comma, limits=c(0,500000), breaks=c(0,50000,100000,150000,200000,250000,300000,350000,400000,450000,500000))

> ggplot(Dist, aes(factor(CHR), InterMarkerDistance)) + geom_boxplot() + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x))) + annotation_logticks(sides = "l") + ylab(label = "Inter-Marker Distance (bp)") + xlab(label = "Chromosome") + theme(axis.text = element_text(size=16), axis.title = element_text(size=20))

# Bar Chart

ggplot(dat, aes(factor(dat$Hugo), dat$n_nonsilent, fill=dat$TPM_2)) +

geom_bar(stat = "identity") +

theme(axis.text.y = element_text(size = 14), axis.text.x = element_text(angle=90,size = 14,colour = "black", hjust = 0.9, vjust=0.5), axis.title = element_text(size = 24), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) +

scale_fill_gradient2("Log2\nMedian\n(TPM+1)", low="blue", mid="yellow", high="red", midpoint = 5) +

ylab(label = "Non-synonomous Mutation Count") +

xlab(label = "")

Create a Map

library(ggplot2)

library("PBSmapping")

library("data.table")

library(maptools)

library(maps)

mmrf_map <- read.table("map.txt", header=T)

> head(mmrf_map)

Vertical Horizontal

1 33.75286 -84.39013

2 34.11918 -84.30292

3 35.68612 -79.82919

vert <- as.vector(mmrf_map$Vertical)

hort <- as.vector(mmrf_map$Horizontal)

xlim=c(-150,25)

ylim=c(20,70)

worldmap = map_data("world")

setnames(worldmap, c("X","Y","PID","POS","region","subregion"))

worldmap = clipPolys(worldmap, xlim=xlim,ylim=ylim, keepExtra=TRUE)

statemap = map_data("state")

setnames(statemap, c("X","Y","PID","POS","region","subregion"))

statemap = clipPolys(statemap, xlim=xlim,ylim=ylim, keepExtra=TRUE)

test.y <- c(33.752856, 34.119177, 35.686122, 45.778852, 51.083333, 53.55)

test.x <- c(-84.39013, -84.30292, -79.82919, -108.5742, -114.083333, -113.5)

p <- ggplot() + coord_map(xlim=xlim,ylim=ylim) + geom_polygon(data=worldmap,aes(X,Y,group=PID), fill = "grey70",color="grey50") + geom_polygon(data=statemap,aes(X,Y,group=PID), fill = "grey70",color="grey50")

p + geom_point(aes(x=test.x, y=test.y), color="blue", size=3)

p + geom_point(aes(x=hort, y=vert), color="blue", size=3)

Install packages from the command line

sudo R

install.packages(c("PackageName"),dependencies=T)

q()

Graphing

Histogram

As tabular data

> hist(Table$Size,plot=F)

Trim value list to those that are less than 500

> Trimmed<-Table$Size[Table$Size<500]

Graph trimmed with bins from 0-500 with length of 10

> bins=seq(0,500,by=10)

> hist(Trimmed,breaks=bins)

Google Sites

Report abuse