%
% NOTE -- ONLY EDIT THE .Rnw FILE!!!  The .tex file is
% likely to be overwritten.
%
% Lecture slides (landscape article, one slide per page) with embedded
% R code chunks.  Each slide is a brace group ended by \clearpage.
%
\documentclass[landscape]{article}

\usepackage{amsmath,pstricks}
\usepackage[authoryear,round]{natbib}
\usepackage{hyperref}
\usepackage{sectsty}
\usepackage{landscape}
\usepackage{graphics}

% Page geometry.
\textwidth=10.9in
\textheight=6.5in
%\parskip=.3cm
\oddsidemargin=.0in
\evensidemargin=.0in
\headheight=-.3in

% Math style and font size shortcuts.
\newcommand{\scscst}{\scriptscriptstyle}
\newcommand{\scst}{\scriptstyle}
\newcommand{\dimfo}{\fontsize{14}{16}\selectfont}
\newcommand{\expfo}{\fontsize{18}{22}\selectfont}

% Markup for R entities.
\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\texttt{#1}}}
\newcommand{\Rfunarg}[1]{{\texttt{#1}}}
\newcommand{\Rclass}[1]{{\textit{#1}}}

% Abbreviations for opening and closing an itemize list.
\newcommand\bi{\begin{itemize}}
\newcommand\ei{\end{itemize}}

% This later assignment overrides the \textwidth=10.9in set above.
\textwidth=8.2in

\bibliographystyle{plainnat}

% Sans-serif as the document default family.
\renewcommand{\familydefault}{\sfdefault}
\usepackage[T1]{fontenc}

\newcommand\cp{{\clearpage}}

% Slide titles in four sizes.  The size switch is not scoped inside the
% macro, so it stays in force for the rest of the enclosing group; the
% braces wrapping each slide are what confine it to that slide.
\newcommand\sts[1]{\Huge \textbf{#1}}
\newcommand\stsh[1]{\huge \textbf{#1}}
\newcommand\stsL[1]{\Large \textbf{#1}}
\newcommand\stsl[1]{\large \textbf{#1}}

\begin{document}

\fontsize{18}{22}
\selectfont

\allsectionsfont{\sffamily}
\sectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}
\subsectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}
\subsubsectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}

%\setkeys{Gin}{width=0.85\textwidth}

% ---------------------------------------------------------------- title
{
\Huge
\begin{center}
\textbf{Lecture 5: statistical analysis with microarrays \\
\copyright\ 2008 VJ Carey PhD \\
Channing Lab}
\end{center}

\begin{itemize}
\item sketches
\item SAM
\item hypergeometric testing
\item GSA, safe
\ei
}

\clearpage

% ------------------------------------------------------------- overview
{\stsh{Sketch of the field}

\bi
\item gene-by-gene: lists of differentially expressed genes
  (two-sample problem, time course, factorial design)
  \bi
  \item differential with respect to sample
    characteristics/experimental conditions
  \ei
\item signature: vector of discriminating genes (or
  transformations thereof) -- linear discriminant analysis,
  machine learning
  \bi
  \item variable selection and model formulation occur simultaneously
  \ei
\item gene set enrichment analysis: prespecified families of genes
  checked for association with sample characteristics
\item genetics of gene expression -- does genotype explain
  differential expression?
\item copy number variation -- a pervasive determinant?
\ei
}

\clearpage

{\stsh{Basic statistical requirements}

\bi
\item categorical inference: $2 \times 2$ tables constructed with
  dichotomous margins ``gene differentially expressed'',
  ``gene member of biologic category''
\item analysis of variance: mean values of a continuous response
  (gene expression) compared across categories (phenotype,
  experimental condition)
  \bi
  \item $t$-test is a special case; linear regression closely allied
  \item censored or clustered outcomes may need to be accommodated
  \ei
\item logistic regression; linear discriminant analysis --
  predictive signature
\ei
}

\clearpage

{\sts{Novel requirements}

\bi
\item small sample adjustments -- SAM, dealing with explosive
  denominators in $t$-statistics
\item regularization -- shrink unstable variance estimates towards
  a common value
\item multiple comparisons adjustments
\item inference on network structures
\ei
}

\clearpage

% ------------------------------------------- gene-by-gene testing, SAM
{\stsL{basic differential expression}

\bi
\item we will illustrate with the mouse asthma data mmAsth.rda
\item wild-type mice will be tested for differential expression
  due to OVA exposure
\item we will use nsFilter to get rid of unannotated and
  low-variability genes
\ei

% NOTE(review): the code chunk headers in this file carry no names or
% options.  If figures or output suppression are wanted (e.g. fig=TRUE on
% the chunks that call plot(), echo/results settings), the options have to
% be supplied by hand -- confirm against the intended slide output.
<<>>=
library(Biobase)
load("mmAsth.rda")
mml = mmAsth[, mmAsth$genotype == "WT"]
library(genefilter)
<<>>=
mmlf = nsFilter(mml)
<<>>=
mmlf[-1]
mmlf = mmlf[[1]]
@
}

\clearpage

{\stsL{gene-specific t tests -- denominator explosion}

<<>>=
mmt = rowttests(mmlf, "trt")
<<>>=
den = mmt$dm / mmt$stat
plot( den, mmt$stat, pch="." )
@
}

\clearpage

{\stsL{SAM regularization}

<<>>=
library(siggenes)
<<>>=
ss = sam(mmlf, "trt")
<<>>=
ss
ss@s0
@
}

\clearpage

{\stsL{damping the denominators}

<<>>=
nstat = mmt$dm/(den + .397)
plot((den+.397), nstat)
@
}

\clearpage

<<>>=
plot(ss, 2.2)
@

\clearpage

% ------------------------------------------------ hypergeometric testing
{\stsL{choice}

<<>>=
sig2.2 = summary(ss, 2.2)@row.sig.genes
dprobes = names(sig2.2)
dprobes
library(GO.db)
<<>>=
gol = as.list(GOTERM)
<<>>=
allt = sapply(gol, Term)
ans = allt[allt == "inflammatory response"]
gotag = names(ans)
@
}

\clearpage

{\stsL{$2 \times 2$ table}

<<>>=
allp = featureNames(mmlf)
library(mouse4302.db)
inir = allp %in% mouse4302GO2ALLPROBES[[gotag]]
inde = allp %in% dprobes
table(inir, inde)
fisher.test(table(inir, inde))
@
}

\clearpage

{\stsL{many $2 \times 2$ tables}

<<>>=
library(GSEABase)
<<>>=
gsc = GeneSetCollection(mmlf, setType=GOCollection())
<<>>=
sz = sapply(gsc, function(x) length(geneIds(x)))
gsc = gsc[sz > 30]
gsc
@

\clearpage

<<>>=
listOfSets = lapply(gsc, geneIds)
vecOfTests = lapply(listOfSets, function(x)
    fisher.test(table(allp %in% x, inde))$p.value)
kp = gsc[ wv <- which(unlist(vecOfTests) < 0.01 ) ]
cbind( tm = sapply( lookUp( names(kp), "GO", "TERM"), Term ),
    pval = vecOfTests[ wv ] )[ order(unlist(vecOfTests)[wv]), ]
@
}

\clearpage

% ------------------------------------- GSA (static transcript, not run)
\begin{verbatim}
> library(GSA)
> afl = lapply(kp, geneIds)
> gsa1 = GSA( exprs(mmlf), as.numeric(mmlf$trt), afl, featureNames(mmlf),
     resp.type="Two class unpaired")
\end{verbatim}

\clearpage

\begin{verbatim}
> GSA.listsets(gsa1, FDRcut=.8)
$FDRcut
[1] 0.8

$negative
     Gene_set Gene_set_name Score     p-value FDR
[1,] "8"      "xxxxxx"      "-0.2509" "0.045" "0.27"
[2,] "13"     "xxxxxx"      "-0.4888" "0.045" "0.27"
[3,] "5"      "xxxxxx"      "-0.2374" "0.08"  "0.32"
[4,] "6"      "xxxxxx"      "-0.2366" "0.14"  "0.32"
[5,] "11"     "xxxxxx"      "-0.3267" "0.145" "0.32"
[6,] "3"      "xxxxxx"      "-0.2455" "0.16"  "0.32"
[7,] "7"      "xxxxxx"      "-0.1402" "0.205" "0.345"
[8,] "4"      "xxxxxx"      "-0.2632" "0.23"  "0.345"
[9,] "12"     "xxxxxx"      "-0.0054" "0.445" "0.5933"

$positive
     Gene_set Gene_set_name Score    p-value FDR
[1,] "9"      "xxxxxx"      "0.2647" "0.05"  "0.42"
[2,] "1"      "xxxxxx"      "0.4317" "0.07"  "0.42"

$nsets.neg
[1] 9

$nsets.pos
[1] 2

names(kp)
 [1] "GO:0007155" "GO:0006817" "GO:0006955" "GO:0045087" "GO:0006954"
 [6] "GO:0006952" "GO:0007067" "GO:0006935" "GO:0005576" "GO:0005615"
[11] "GO:0009897" "GO:0005125" "GO:0008009"
\end{verbatim}

\clearpage

% ------------------------------------ safe (static transcripts, not run)
\begin{verbatim}
library(safe)
ssa1 = safe(mmlf, as.numeric(mmlf$trt), annotate="GO.MF",
    platform="mouse4302" )
SAFE results:
  Local: t.Student
  Global: Wilcoxon
  Method: permutation

           Size Mean.Rank Emp.pvalue
GO:0051864    2    7382.5      0.356
GO:0008252    7    6681.9      0.566
GO:0008294    2    6026.5       0.63
GO:0008253    6    6039.3      0.713
GO:0008266    2    3255.0      0.884
\end{verbatim}

\clearpage

\begin{verbatim}
> ssa2 = safe(mmlf, as.numeric(mmlf$trt), annotate="GO.BP",
    platform="mouse4302" )
> ssa2[1:5,]
SAFE results:
  Local: t.Student
  Global: Wilcoxon
  Method: permutation

           Size Mean.Rank Emp.pvalue
GO:0006824    3    9074.3      0.098
GO:0006820   96    7274.1      0.227
GO:0006821   31    6712.7      0.489
GO:0006825    9    6509.9      0.489
GO:0002320    3    3493.3      0.954
\end{verbatim}

\clearpage

\begin{verbatim}
> which(gotag == names(ssa2@global.stat))
[1] 575
SAFE results:
  Local: t.Student
  Global: Wilcoxon
  Method: permutation

           Size Mean.Rank Emp.pvalue
GO:0006954  141    8487.5       0.03
\end{verbatim}

\end{document}