%
% NOTE -- ONLY EDIT THE .Rnw FILE!!! The .tex file is
% likely to be overwritten.
%
\documentclass[landscape]{article}
\usepackage{amsmath,pstricks}
\usepackage[authoryear,round]{natbib}
\usepackage{hyperref}
\usepackage{sectsty}
\usepackage{landscape}
\usepackage{graphics}
\textwidth=10.9in
\textheight=6.5in
%\parskip=.3cm
\oddsidemargin=.0in
\evensidemargin=.0in
\headheight=-.3in
\newcommand{\scscst}{\scriptscriptstyle}
\newcommand{\scst}{\scriptstyle}
\newcommand{\dimfo}{\fontsize{14}{16}\selectfont}
\newcommand{\expfo}{\fontsize{18}{22}\selectfont}
\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\texttt{#1}}}
\newcommand{\Rfunarg}[1]{{\texttt{#1}}}
\newcommand{\Rclass}[1]{{\textit{#1}}}
\newcommand\bi{\begin{enumerate}}
\newcommand\ei{\end{enumerate}}
\textwidth=8.2in
\bibliographystyle{plainnat}
\renewcommand{\familydefault}{\sfdefault}
\usepackage[T1]{fontenc}
\newcommand\cp{{\clearpage}}
\newcommand\sts[1]{\Huge \textbf{#1}}
\newcommand\stsh[1]{\huge \textbf{#1}}
\newcommand\stsL[1]{\Large \textbf{#1}}
\newcommand\stsl[1]{\large \textbf{#1}}
\begin{document}
\fontsize{18}{22} \selectfont
\allsectionsfont{\sffamily}
\sectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}
\subsectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}
\subsubsectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}
%\setkeys{Gin}{width=0.85\textwidth}
{
\Huge
\begin{center}
\textbf{Lecture 2: Container designs and methods \\
\copyright 2008 VJ Carey, Ph.D.
\\ Channing Lab}
\bi
\item concepts
  \bi
  \item metadata binding
  \item closure under subsetting
  \item formal class assignment supports multiple dispatch
  \ei
\item expr+SNP: racExSet
\item Gene sets
\item networks/pathways
\item hg18 anno tracks
\item genomic strings
\item machine learning output containers
\ei
\end{center}
}
\clearpage
%
% Slide: matrix-style accessors and the two Bioconductor "twists"
{\sts{Container concepts}
\bi
\item Recall schematic: N samples, G features are assayed, R sample-level
  variables (phenoData) are collected
\item basic matrix accessor idioms in R:
  \bi
  \item X[G, ] -- all columns, rows identified by G
  \item X[, S] -- all rows, columns identified by S
  \item X[G, S] -- both rows and columns restricted
  \ei
\item Bioconductor preserves that general concept with two twists
  \bi
  \item twist 1: introspection-oriented metadata must be bound in to X
  \item twist 2: the class of X must be closed under subsetting operations
  \ei
\ei
}
\clearpage
%
% Slide: metadata components bound to the container
{\sts{The metadata twist}
\bi
\item experimental output should include provenance documentation
  \bi
  \item experimentData component
  \ei
\item assay reporter nomenclature resolution must be supported
  \bi
  \item annotation/featureData component
  \ei
\item variable names may need clarification
  \bi
  \item varMetadata component
  \ei
\ei
}
\clearpage
%
% NOTE(review): the chunk headers below are written as bare <<>>= because any
% original chunk names/options are not present in this file -- confirm whether
% options such as echo=FALSE were intended for the data-loading chunk.
%
% Load the example ExpressionSet once; later chunks display it and its metadata.
<<>>=
library(MAQCsubset)
if(!exists("afxsubRMAES")) data(afxsubRMAES)
@
<<>>=
afxsubRMAES
@
\clearpage
<<>>=
experimentData(afxsubRMAES)
@
\clearpage
<<>>=
featureData(afxsubRMAES)
@
\clearpage
<<>>=
varMetadata(afxsubRMAES)
@
\clearpage
%
% Slide: why subsetting must return the same kind of container
{\sts{The closure twist}
\bi
\item subsetting operations are common
  \bi
  \item focus on a gene set
  \item focus on samples sharing a phenotype
  \item filter genes with little variation in expression
  \ei
\item complex operations need inputs with predictable structure
\item therefore preserve structure across subsetting operations
\ei
}
\clearpage
% Subset to one feature and two samples, then show metadata on the subset.
<<>>=
afxsubRMAES[1,1:2]
@
\clearpage
<<>>=
experimentData(afxsubRMAES[1,1:2])
@
\clearpage
%
% Slide: consequences of the subsetting shown above
{\sts{Upshots}
\bi
\item we have dramatically cut down the number of features and samples
\item this was accomplished with Y = X[G, S]
\item operations that work on ``full'' arrays still work on Y
\item metadata on the retained features are still available for Y
\ei
}
\clearpage
%
% Slide: components expected of an eSet-derived container
{\sts{Principles}
\bi
\item the container class is derived from Biobase::eSet
\item at least two major numerical/factor data components
  \bi
  \item Assay data (G x N)
    \bi
    \item attributes of features in feature data (G x q)
    \ei
  \item `pheno' data (sample-level, N x R)
    \bi
    \item attributes of variables varMetadata (R x s)
    \ei
  \ei
\item textual prose metadata experimentData (MIAME schema, abstract)
\item platform token (annotation)
\item IF ANY OF THESE ARE ABSENT BE SURE YOU CAN JUSTIFY IT, AS THE DEFICIT
  WILL BE PROPAGATED TO PEOPLE WHO COULD HAVE BENEFITED HAD IT BEEN REMEDIED
\ei
}
\clearpage
%
% Slide: a container combining genotype and expression data
{\sts{More general containers}
\bi
\item racExSet: rare allele count + expression
\ei
% NOTE(review): the chunk header is written as bare <<>>= because any original
% chunk name/options are not present in this file -- confirm.
{\large
<<>>=
library(GGtools)
data(chr20GGdem)
chr20GGdem
@
}
}
\clearpage
%
% Slide: motivating eQTL questions (the answers follow in the same list)
{ \sts{Questions and answers}
\bi
\item questions:
  \bi
  \item You are interested in the hypothesis that a given SNP may be an eQTL
    for a given gene. How can you test it?
  \item You wish to check a certain interval around a gene for SNPs for which
    rare allele copy number is associated with expression. How?
  \item You wish to search the entire genome for SNPs that may be eQTL for any
    genes in a given pathway. How?
\ei \item answers: \bi \item genotype the individuals on whom transcript profile arrays are available \item create a racExSet or allied structure \item use GGtools snpScreen, twSnpScreen or similar function \ei \ei } \clearpage { \stsL{How do these analyses work?} \bi \item the class structure is defined <>= getClass("racExSet") @ \item methods are defined for specific signatures (combinations of classes) \item validity checking on class instances yields guarantees that input to method has appropriate structure \item method can construct an instance of a result class \item chain of methods has high reliability of succeeding at each step \ei @ } \clearpage {\stsL{Another species of container: gene sets} <>= library(GSEABase) fl <- system.file("extdata", "Broad.xml", package="GSEABase") gs2 <- getBroadSets(fl)[[1]] gs2 getClass(class(gs2)) @ } \clearpage {\stsL{working with gene sets} <>= geneIds(gs2)[1:5] gs3 = gs2 geneIdType(gs3) = AnnotationIdentifier("hgu133plus2") geneIds(gs3)[1:5] gs4 = gs2 geneIdType(gs4) = AnnotationIdentifier("illuminaHumanv2") geneIds(gs4)[1:5] gs3 gs4 @ } \clearpage {\stsL{containers for graphs and networks} <>= library(keggorth) data(KOgraph) KOgraph nodes(KOgraph)[1:4] adj(KOgraph, "Metabolism") @ } \clearpage {\sts{Visualizing} <>= library(Rgraphviz) plot(KOgraph) @ } \clearpage {\sts{Visualizing a pathway} <>= library(pathRender) data(pancrCaIni) plot(pancrCaIni, nodeAttrs=pwayRendAttrs(pancrCaIni)) @ } \clearpage {\stsL{Enhancing a pathway diagram with observational data} <>= library(ALL) data(ALL) library(hgu95av2.db) rmap = revmap(hgu95av2SYMBOL) rALL = reduceES( ALL, nodes(pancrCaIni), rmap, collapseFun=mean ) par(mfrow=c(2,2)) plotExGraph(pancrCaIni, rALL, 1, main="BCR/ABL 1") text(120,780,"BCR/ABL 1", cex=1.3) plotExGraph(pancrCaIni, rALL, 3, main="BCR/ABL 2") text(120,780,"BCR/ABL 2", cex=1.3) plotExGraph(pancrCaIni, rALL, 3, main="NEG 1") text(120,780,"NEG 1", cex=1.3) plotExGraph(pancrCaIni, rALL, 4, main="ALL1/AF4 1") 
text(120,780,"ALL1/AF4 1", cex=1.3) par(mfrow=c(1,1)) @ } \clearpage {\stsL{creating a pathway graph} \begin{verbatim} \end{verbatim} } \clearpage <>= ccp = fromGXL(file("cc.gxl")) plot(ccp, nodeAttrs=pwayRendAttrs(ccp)) @ \clearpage {\stsL{Other approaches} <>= library(cMAP) G1 = graphcMAP("stresspathway") plot(G1) @ } \clearpage {\stsL{hg18 track data} \bi \item UCSC genome browser widely used \item annotation 'tracks' are simple files consisting of coordinates and values \ei <>= library(encoDnaseI) data(rawCD4) rawCD4 @ } \clearpage <>= plot(getTrkXY(rawCD4[chrnum(20),])) @ \clearpage {\sts{Biostring containers} <>= library(Biostrings) d <- DNAString("TTGAAAA-CTC-N") length(d) alphabet(d) views(d, c(1,2,3,4),c(6,7,8,9)) mm = matchPattern("AA", d) start(mm) @ } \clearpage <>= pom = readFASTA("pombe_chr02_region.fasta") names(pom) names(pom[[1]]) poms = DNAString(pom[[1]]$seq) poms aam = matchPattern("AA", poms) class(aam) @ \clearpage {\stsL{A last type of container related to analysis} <>= library(MLInterfaces) library(golubEsets) data(Golub_Train) ld1 = MLearn(ALL.AML~., Golub_Train[400:800,], ldaI, xvalSpec("LOO")) ld1 class(ld1) confuMat(ld1) @ } \clearpage {\stsL{The classifierOutput container} <>= getClass(class(ld1)) @ } \clearpage {\stsh{Summary} \bi \item interactive computing: objects should have concise and suggestive representations \item `show' methods control this \item getClass shows the anatomy of a class; class definitions can employ other class definitions \item setValidity for a class imposes unbreakable conditions on class structure -- an object will not be created if these are not met \item array data, gene sets, networks, tracks, strings, classifiers all occupy containers with specified structure \item strings, matrices, vectors, lists can have arbitrary sizes and have only very rudimentary introspection \item contrast annotation-identified ExpressionSet with a matrix or list \item formal containers for downstream results facilitate linkage 
of workflow components (e.g., classifierOutput) \ei } \end{document}