## Supplementary materials, R code and data for

—Worboys et al. 2014—

Yinyin Yuan

Preparation

Firstly we load the SRM data exported from Skyline. In the Skyline export options, ensure that “ProteinName” under the “Protein” header, “PeptideSequence” under the “Peptides” header and “TotalArea” under the “PrecursorResults” header is selected for export. Also ensure the box “Pivot Replicate Name” is checked.

This section also loads R functions for later data analysis and visualisation. The data files are available as part of the paper supplement and also online at (http://yuanlab.org/software/QP/data.zip).

source("quantotypic_functions.R")
tab <- read.csv("./data/SkylineDataExport.csv", as.is = T, na.strings = c("#N/A", 
    "NA"))
head(tab)
##   PeptideSequence           ProteinName ASPC1.TotalArea HPAC.TotalArea
## 1        ADSEQLAR sp_O75116_ROCK2_HUMAN          255350         222579
## 2      DEEISAAAIK sp_O75116_ROCK2_HUMAN           46512          46935
## 3      QLDETNALLR sp_O75116_ROCK2_HUMAN          135661         352638
## 4      SQGGDGFYGR sp_O75116_ROCK2_HUMAN          114375         359897
## 5         YVIVSSK sp_O75116_ROCK2_HUMAN          202400         493709
## 6       EFVADETER sp_O75582_KS6A5_HUMAN           41442          89494
##   MIAPACA2.TotalArea PANC1.TotalArea PL45.TotalArea PL5.TotalArea
## 1               8108          224125         459135        429506
## 2              31569           49799          40292         47528
## 3              31853          114528         689544        254916
## 4              11339          167293         728011        269902
## 5              51458          228792        1166066        333686
## 6              48542           69313         349993        257830
##   BXPC3.TotalArea CAPAN2.TotalArea CFPAC1.TotalArea HPAFII.TotalArea
## 1           34557            12613             4131             1110
## 2            7344             5450             3535             4606
## 3           45788            40040            17902            16129
## 4           37274            19312             6081             2409
## 5           24437             7231             2130              462
## 6           53888            14170            47050             1408
##   PANC10_05.TotalArea SW1990.TotalArea
## 1                1995             2808
## 2                2747             5187
## 3               38985            33488
## 4                7628             7926
## 5                 982             2001
## 6                 725             4268

Take only validated peptides from synthetic peptide analysis.

val_peps <- read.csv("./data/PepResultsSummary_AllFilters.csv")
row1 <- tab$PeptideSequence %in% val_peps$Peptide
tab2 <- tab[row1, ]
tab <- tab2
colnames(tab)[2] <- "ProteinName.x"
x <- table(tab$ProteinName.x)
x[sort.list(x, decreasing = T)][1:5]
## 
##  sp|Q9Y6R4|M3K4_HUMAN sp|Q9Y5S2|MRCKB_HUMAN  sp|O00418|EF2K_HUMAN 
##                     8                     7                     6 
##  sp|P05129|KPCG_HUMAN   sp|P07948|LYN_HUMAN 
##                     6                     6
hist(x, main = "ProteinName.x frequency", br = 20, xlab = "Number of peptides per protein")

plot of chunk hist-protein

Extract protein names: We have used the Uniprot database and this section extracts the names from the full protein names in the database, e.g. “sp_075116_ROCK2_HUMAN” becomes “ROCK2”.

protein <- tab$ProteinName.x
protein <- sapply(protein, function(x) strsplit(x, split = "_HUMAN", fixed = T)[[1]][1])
protein <- sapply(protein, function(x) strsplit(x, split = "_", fixed = T)[[1]][length(strsplit(x, 
    split = "_", fixed = T)[[1]])])
protein <- sapply(protein, function(x) strsplit(x, split = "|", fixed = T)[[1]][length(strsplit(x, 
    split = "|", fixed = T)[[1]])])
tab$protein <- protein

Filtering of data and log transformation. This searches for the term “TotalArea” from the column names to extract all areas exported from the Skyline data file.

dat <- tab
dat <- data.frame(as.matrix(dat[, grepl("TotalArea", colnames(dat))]), protein = dat$protein, 
    peptide = dat$PeptideSequence, stringsAsFactors = F)
sum(is.na(dat))
## [1] 95
ncond <- length(dat[, grepl("TotalArea", colnames(dat))])
dat[, 1:ncond] <- log(dat[, 1:ncond])
dat <- dat[sort.list(dat$protein), ]
hist(rowSums(is.na(dat[, 1:ncond])), br = 100, xlab = "Number of missing data per peptide", 
    main = "")

plot of chunk hist-dat

Remove rows where there are less than 4 measurements in the first group. This also ensures that when we later compare group 1 with both groups there will be at least 4 measurements.

idx <- rowSums(!is.na(dat[, 1:6])) > 3
dat <- dat[idx, ]
dim(dat)
## [1] 411  14
x <- table(dat$protein)
idx <- dat$protein %in% names(x)[x > 2]
dat <- dat[idx, ]
idx <- dat$protein
dim(dat)
## [1] 411  14

Reformat data

proteinName <- dat$protein
peptideName <- dat$peptide
dat <- dat[, 1:12]
dat <- as.matrix(dat)
rownames(dat) <- NULL
head(dat)
##      ASPC1.TotalArea HPAC.TotalArea MIAPACA2.TotalArea PANC1.TotalArea
## [1,]           12.14          12.28              12.65           13.32
## [2,]           12.06          12.16              12.45           13.16
## [3,]           13.67          13.51              14.13           14.87
## [4,]           14.88          14.77              14.99           15.54
## [5,]           11.73          11.60              11.29           13.77
## [6,]           11.65          11.09              10.70           13.59
##      PL45.TotalArea PL5.TotalArea BXPC3.TotalArea CAPAN2.TotalArea
## [1,]         12.261         12.63          11.604           10.387
## [2,]         12.202            NA          10.631            9.634
## [3,]         13.853         14.14          12.831           11.653
## [4,]         14.803         14.97          14.076           12.924
## [5,]          6.946         11.44           9.334           10.476
## [6,]          8.688         11.52           8.716            9.765
##      CFPAC1.TotalArea HPAFII.TotalArea PANC10_05.TotalArea
## [1,]           11.053            9.567               9.573
## [2,]            9.834            7.871               8.305
## [3,]           12.448           10.609              10.862
## [4,]           13.505           12.237              12.216
## [5,]           10.019           10.485               3.970
## [6,]            9.862            9.519               5.017
##      SW1990.TotalArea
## [1,]            9.752
## [2,]            7.973
## [3,]           10.697
## [4,]           12.326
## [5,]            8.388
## [6,]            7.419

Visualise data

tmp <- dat
rownames(tmp) <- NULL
par(mfrow = c(1, 2), mar = c(8, 0, 2, 1))
image(t(tmp[, 1:6]), main = "Grp1")
image(t(tmp[, 7:12]), main = "Grp2")

plot of chunk image-dat

Discern quantotypic peptides and score protein

Peptide correlation

Compute peptide correlation using groups 1 (a1), 2 (a2) and both (a0). Here the computing takes a little longer, therefore the result has been pre-stored in a rdata file and can be directly loaded (Section2.rdata).

a1 <- asymptoticQP(dat[, 1:6], peptideName, proteinName, n = 6)
a2 <- asymptoticQP(dat[, 7:12], peptideName, proteinName, n = 6)
a0 <- asymptoticQP(dat, peptideName, proteinName, n = 12)
save(a1, a2, a0, file = "./data/Section2.rdata")

Plot distribution of correlations from groups 1, 2, and combined.

load(file = "./data/Section2.rdata")
th.a <- 0.05
par(mfrow = c(3, 1))
hist.sig(a1, th1 = 0, th2 = th.a, xlab = "Asymptotic correlation p-value", main = "Group 1")
hist.sig(a2, th1 = 0, th2 = th.a, xlab = "Asymptotic correlation p-value", main = "Group 2")
hist.sig(a0, th1 = 0, th2 = th.a, xlab = "Asymptotic correlation p-value", main = "Both groups")

plot of chunk hist-sig

The distribution can be plotted on the log10 scale.

par(mfrow = c(3, 1))
m <- max(c(-log10(a1), -log10(a2), -log10(a0)), na.rm = TRUE)
hist.sig(-log10(a1), th1 = -log10(0.05), th2 = max(-log10(a1), na.rm = T), xlab = "Asymptotic correlation -log10 p-value", 
    main = "Group 1")
hist.sig(-log10(a2), th1 = -log10(0.05), th2 = max(-log10(a2), na.rm = T), xlab = "Asymptotic correlation -log10 p-value", 
    main = "Group 2")
hist.sig(-log10(a0), th1 = -log10(0.05), th2 = max(-log10(a0), na.rm = T), xlab = "Asymptotic correlation -log10 p-value", 
    main = "Both")

plot of chunk hist-sig-log

Q-Q plot with the sorted correlation quantiles between two groups shows that the distribution of correlations between the two groups is very good.

qqplot(a1, a2, main = "Normal Q-Q plot of correlations", xlab = "Group 1", ylab = "Group 2")
abline(0, 1, col = "red")
text(0.46, 0.5, labels = expression(y == x), srt = 45, col = "red")

plot of chunk Q-Qplot

Write significant correlations to files

s1 <- getListQP(a1, sigTh = th.a, direction = "smaller", peptideName = peptideName, 
    proteinName = proteinName)
s2 <- getListQP(a2, sigTh = th.a, direction = "smaller", peptideName = peptideName, 
    proteinName = proteinName)
s0 <- getListQP(a0, sigTh = th.a, direction = "smaller", peptideName = peptideName, 
    proteinName = proteinName)
writeFilledTable(s1, file = paste("./figure/PeptideCorrelationPvalueList_Grp1.csv", 
    sep = ""), fileType = "csv")
write.csv(a1, file = paste("./figure/PeptideCorrelationPvalueMatrix_Grp1.csv", 
    sep = ""))
writeFilledTable(s2, file = paste("./figure/PeptideCorrelationPvalueList_Grp2.csv", 
    sep = ""), fileType = "csv")
write.csv(a2, file = paste("./figure/PeptideCorrelationPvalueMatrix_Grp2.csv", 
    sep = ""))
writeFilledTable(s0, file = paste("./figure/PeptideCorrelationPvalueList_Both.csv", 
    sep = ""), fileType = "csv")
write.csv(a0, file = paste("./figure/PeptideCorrelationPvalueMatrix_Both.csv", 
    sep = ""))

Comparison of peptide correlations between groups of samples

Compare peptide correlations in groups 1, 2 and both.

plotPeptideCorrelation(a1, direction = "smaller", sigTh = th.a)

plot of chunk PeptideCorrelation_Expr1

plotPeptideCorrelation(a2, direction = "smaller", sigTh = th.a)

plot of chunk PeptideCorrelation_Expr2

plotPeptideCorrelation(a0, direction = "smaller", sigTh = th.a)