%
% NOTE -- ONLY EDIT THE .Rnw FILE!!!  The .tex file is
% likely to be overwritten.
%
% Slide deck: "Bioconductor training March 2008, Lecture 1: Introductory".
% Each slide is a brace group opened by one of the \sts* title macros and
% followed by \clearpage.  R code lives in Sweave code chunks.
%
% NOTE(review): the line structure of this file, the code-chunk headers and
% the two verbatim diagrams were restored from a whitespace-collapsed copy.
% Chunk options (labels, fig=TRUE, echo=FALSE, ...) that the chunk headers
% may originally have carried are not recoverable from that copy -- TODO
% confirm against the authoritative source before rebuilding the slides.
%
\documentclass[landscape]{article}
\usepackage{amsmath,pstricks}
\usepackage[authoryear,round]{natbib}
\usepackage{hyperref}
\usepackage{sectsty}
\usepackage{landscape}
% graphicx (not graphics): the slides use the \setkeys{Gin}{width=...} keys.
\usepackage{graphicx}

% Page geometry.
\textwidth=10.9in
\textheight=6.5in
%\parskip=.3cm
\oddsidemargin=.0in
\evensidemargin=.0in
\headheight=-.3in

% Math style shorthands.
\newcommand{\scscst}{\scriptscriptstyle}
\newcommand{\scst}{\scriptstyle}

% Font-size shorthands.
\newcommand{\dimfo}{\fontsize{14}{16}\selectfont}
\newcommand{\expfo}{\fontsize{18}{22}\selectfont}

% Semantic markup for R entities.
\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\texttt{#1}}}
\newcommand{\Rfunarg}[1]{{\texttt{#1}}}
\newcommand{\Rclass}[1]{{\textit{#1}}}

% Itemize shorthands used on every slide.
\newcommand\bi{\begin{itemize}}
\newcommand\ei{\end{itemize}}

% This later assignment overrides the \textwidth=10.9in set above.
\textwidth=8.2in

\bibliographystyle{plainnat}

\renewcommand{\familydefault}{\sfdefault}
\usepackage[T1]{fontenc}

\newcommand\cp{{\clearpage}}

% Slide-title macros: switch to the given size and set the title in bold.
% The size switch is not scoped by the macro, so it stays in force for the
% rest of the enclosing slide group.
\newcommand\sts[1]{\Huge \textbf{#1}}
\newcommand\stsh[1]{\huge \textbf{#1}}
\newcommand\stsL[1]{\Large \textbf{#1}}
\newcommand\stsl[1]{\large \textbf{#1}}

\begin{document}

\fontsize{18}{22}
\selectfont

\allsectionsfont{\sffamily}
\sectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}
\subsectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}
\subsubsectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}

%\setkeys{Gin}{width=0.85\textwidth}

% ---------------------------------------------------------------- title
{
\Huge
\begin{center}
\textbf{Bioconductor training March 2008 \\
Lecture 1: Introductory}
\end{center}
\vspace*{2cm}
\bi
\item quick case study on annotation
\item survey: mRNA abundance, ChIP-chip, SNP+expression, CNV+expression
\item schematization of HT experiments
\item Bioc proficiency concepts; container design introduction
\ei
}

\clearpage

% ------------------------------------------- pathway annotation case study
{
\Huge
\begin{center}
\textbf{Pathway annotation case study \\
intro insert}
\end{center}
\bi
\item ``Gene sets'' are popular tools for analysis
\item rapid survey of a large family of gene sets is
facilitated by programming
\item conversion of moderately conventional annotation for
genes/gene sets to operators on Bioconductor data structures
is illustrated
\ei
}

\clearpage

{\sts{Is my gene in any pathways?}
\bi
\item case of TBX21
\item KEGG -- nothing
\item NCBI -- nothing
\item What about the Broad GSEA-related gene sets?
\item Bioconductor package GSEABase helps navigate these
\ei
}

\clearpage

{\stsL{broadsets.rda}

<<>>=
library(GSEABase)
if (!exists("broadsets")) load("broadsets.rda")
broadsets
class(broadsets)
getClass(class(broadsets))
@
}

\clearpage

{\stsL{information on a set}

<<>>=
broadsets[[1]]
details(broadsets[[1]])
@
}

\clearpage

{\stsL{GeneSetCollection operations}
\bi
\item a GeneSetCollection instance is an R list of GeneSets
\item iteration over list elements is relatively easy in R
\item need to know how to operate usefully on a GeneSet
\item poked at one above with `details' method
\item another method of interest: geneIds

<<>>=
geneIds(broadsets[[1]])
allids = lapply(broadsets, geneIds)
tbxchk = sapply(allids, function(x) any(x == "TBX21"))
sum(tbxchk)
hastbx = which(tbxchk)
sapply(broadsets[hastbx], setName)
@
\ei
}

\clearpage

{\stsL{More info}
\bi
\item we have identified 12 `gene sets' that include TBX21
\item what are they?  we can see that one is just a cytoband
\ei

<<>>=
broadsets[["V$LYF1_01"]]
@
}

\clearpage

{\stsL{More info}

<<>>=
details(broadsets[["V$LYF1_01"]])
longDescription(broadsets[["V$LYF1_01"]])
@
\bi
\item This longDescription result is not very useful \dots\ it
is a long string of HTML.  If we write it to a file, we can
run a browser.  Use writeLines
\ei
}

\clearpage

\setkeys{Gin}{width=1.15\textwidth}
\includegraphics{lyf1}

\clearpage

\includegraphics{lyf1b}

\clearpage

{\sts{Upshots}
\bi
\item A collection of over 3000 sets of genes is bound to
a single R variable name (broadsets)
\item Each set is self-documenting and includes a list of
HUGO identifiers (as given by Broad)
\item methods \verb+geneIds+, \verb+details+, \verb+longDescription+
provide uniform information on each set
\item programming expertise useful
  \bi
  \item general string matching (\texttt{x == 'TBX21'}) or pattern
  matching (grep, case conversion, etc.) available directly to constituents
  \item shortcuts \verb+broadsets[["V$LYF1_01"]]+
  \ei
\item exploit sets and their structures for thorough statistical analysis
\ei
}

\clearpage

{\stsL{Application -- note number of features retained}

<<>>=
library(Biobase)
library(ALL)
data(ALL)
keep = broadsets[["V$LYF1_01"]]
geneIdType(keep) = AnnotationIdentifier(annotation(ALL))
ALL[ geneIds(keep), ]
@
}

\clearpage

% ------------------------------------------------------ four technologies
{\sts{Four technologies}
\bi
\item the concepts
  \bi
  \item mRNA transcript abundance -- expression arrays
  \item binding of TFs to DNA -- ChIP-chip
  \item high-density genotyping -- SNP chips (with expression: genetical genomics)
  \item aCGH and other approaches to copy number variation (CNV)
  \ei
\item What do we do with these?
  \bi
  \item differential expression, coexpression
  \item theory of transcriptional regulation
  \item genetic association studies
  \item assessment of genomic lesions
  \ei
\ei
}

\clearpage

% Setup chunk: attach packages and load data sets only when not already
% present.  search() reports attached packages as "package:<name>", so the
% guards test for that form.
<<>>=
if (!("package:MAQCsubset" %in% search())) library(MAQCsubset)
if (!exists("afxsubRMAES")) data(afxsubRMAES)
if (!("package:harbChIP" %in% search())) library(harbChIP)
if (!("package:Neve2006" %in% search())) library(Neve2006)
if (!("package:GGtools" %in% search())) library(GGtools)
if (!exists("neveExCGH")) data(neveExCGH)
if (!exists("chr20GGdem")) data(chr20GGdem)
data(harbChIP)
load("scr2.rda")
options(width=60)
@

{
\stsh{Differential expression: an example}

<<>>=
library(MAQCsubset)
data(afxsubRMAES) # RMA preprocessing, ES = ExpressionSet
plot( exprs(afxsubRMAES)["206253_at",] ~ afxsubRMAES$pctBrain )
@
}

\clearpage

{\sts{Questions}
\bi
\item How do we interpret `206253\_at' in the foregoing?
\item How can we make a useful (hyperlinked) report on annotation
related to the implicated gene?
\ei
}

\clearpage

{\sts{Solutions 1: annotate::lookUp}

<<>>=
library(hgu133plus2.db)
library(annotate)
lookUp("206253_at", "hgu133plus2", "GENENAME")
@
}

\clearpage

{\sts{Solutions 2: annaffy::aafTableAnn}

<<>>=
library(annaffy)
library(annotate)
dem = aafTableAnn("206253_at", "hgu133plus2.db")
saveHTML( dem, file="abc.html" )
@
}

\clearpage

\setkeys{Gin}{width=1.05\textwidth}
\includegraphics{probeList}

\clearpage

{
\stsh{ChIP-chip: an example}
}

\setkeys{Gin}{width=.85\textwidth}

<<>>=
library(harbChIP); data(harbChIP)
par(mfrow=c(2,2))
dim(exprs(harbChIP))
qqnorm(exprs(harbChIP)[,"ABF1"], main="ABF1", pch=".")
qqnorm(exprs(harbChIP)[,"BYE1"], main="BYE1", pch=".")
qqnorm(exprs(harbChIP)[,"GAL80"], main="GAL80", pch=".")
qqnorm(exprs(harbChIP)[,"HAL9"], main="HAL9", pch=".")
par(mfrow=c(1,1))
@

\clearpage

\setkeys{Gin}{width=1.10\textwidth}
\includegraphics{GAL80}

\clearpage

{\stsh{Using the ChIP-chip data}

<<>>=
library(YEAST)
higal80 = sort(-exprs(harbChIP)[,"GAL80"])[1:10]
unlist(lookUp(names(higal80), "YEAST", "GENENAME"))
library(parody)
calout.detect(na.omit(exprs(harbChIP)[,"GAL80"]))$val
@
}

\clearpage

{\stsh{Genetics of gene expression}

\setkeys{Gin}{width=0.75\textwidth}

<<>>=
library(GGtools)
data(chr20GGdem)
snps(chr20GGdem)[2310:2314,1:4] # rare allele count
plot(snps(chr20GGdem)["rs6060535",],
     exprs(chr20GGdem)["206918_s_at",], main="CPNE1")
@
}

\clearpage

\setkeys{Gin}{width=0.75\textwidth}
\includegraphics{doss}

\clearpage

{\stsh{Questions}
\bi
\item What is known about location and role of SNP rs6060535?
\ei
}

\clearpage

<<>>=
library(biomaRt)
mm = useMart("snp", dataset="hsapiens_snp")
listFilters(mm)[1:2,]
listAttributes(mm)[1:2,]
@

\clearpage

<<>>=
getBM(attributes=c("chr_name", "chrom_start", "ensembl_external_gene_id",
        "ensembl_type", "ensembl_syn_summary", "snp_allele"),
      filters=c("refsnp"), values="rs6060535", mart=mm)
@

\clearpage

{\stsh{chromosomal lesions: example}

<<>>=
library(Neve2006); data(neveExCGH)
experimentData(neveExCGH)
@
}

\clearpage

<<>>=
table(neveExCGH$tumorType, neveExCGH$geneCluster)
@

\clearpage

{\sts{chromosomal lesions: example}

<<>>=
plot(cloneMeta(neveExCGH)$kbGenome, logRatios(neveExCGH)[,1],
     pch=".", cex=4, xlab="kbGenome", ylab="log ratio",
     main = neveExCGH$tumorType[1])
@
}

\clearpage

<<>>=
par(mfrow=c(2,2))
for (i in 1:4)
  plot(cloneMeta(neveExCGH)$kbGenome, logRatios(neveExCGH)[,i],
       pch=".", cex=4, xlab="kbGenome", ylab="log ratio",
       main = neveExCGH$tumorType[i])
par(mfrow=c(1,1))
@

\clearpage

{\sts{Questions}
\bi
\item How can we use the aCGH log ratios to identify genomic
locations of chromosomal lesions?
\item How can we determine proximity of these lesions to genes?
\item What is a sound approach to understanding joint behavior
of genomic lesions, differential expression, and phenotype?
\ei
}

\clearpage

% ---------------------------------------------------------- common threads
{\stsh{Common threads}
\bi
\item lots of features (high-throughput), not so many samples
\item features have complex annotation
  \bi
  \item probe sequence and its context
  \item functional categorization (GO)
  \item pathway membership and role (KEGG, etc.)
  \ei
\item samples (should) have complex annotation
  \bi
  \item experimental conditions
  \item phenotype, disease state, demographics
  \ei
\item analytic agenda is open-ended, integrative
\ei
}

\clearpage

{\stsh{Two motivations to Bioconductor}
\bi
\item Software is a primary tool for discovery in high-throughput biology
  \bi
  \item must support methodologists, developers -- reduce barriers to entry
  (for methodologist) to biological applications
  \item must support/educate/inspire users -- reduce barriers to entry
  (for investigators) to new software and analyses
  \ei
\item Many problems and solutions have shared structures
  \bi
  \item resources can be modified or extended for efficient reuse
  \item examples of reusable resources:
    \bi
    \item tools for inference on linear models
    \item tools for machine learning
    \item structures for multicomponent assay outputs
    \ei
  \ei
\ei
}

\clearpage

% NOTE(review): diagram layout reconstructed from a whitespace-collapsed
% copy; every token is kept in its original order, but exact column
% positions are a best effort -- TODO confirm.
\begin{verbatim}
            raw
         __________
         |..______|__
         |..|...____|____
         |..|...|.......|     MD1: map geometry of chip
         |..|...|.......| S      features to sequence/source
         |__|...|.......|  per chip
            |___|.......|     MD2: map sequence to biological
                |_______|        reference annotation
                    |         MD3: record hybridization
   preprocessing                 metadata including phenotype
                    |            and experimental design
                    V
 --------------------------------------------
  N chips -> ExpressionSet instance
\end{verbatim}

\clearpage

% NOTE(review): diagram layout reconstructed from a whitespace-collapsed
% copy; every token is kept in its original order, but exact column
% positions are a best effort -- TODO confirm.
\begin{verbatim}
  N chips -> ExpressionSet instance

                                  phenoData:       P
                                       id sex disease ...
                                      |----------------------|
  AssayData:       N                  |                      |
    exprs                           N |                      |
  __________________________          |                      |
  | 2.2|...            |   |          |                      |
  | 1.7|               |   |          ------------------------
  |   .|               |   |           + varMetadata
  |   .|               |   |
  |   .|               |   |
G |    |               |   |    + featureData (probe metadata refs)
  |    |               |   |    + experimentData (MIAME)
  |    |               |   |
  |    |               |   |
  |    |               |   |
  __________________________
\end{verbatim}

\clearpage

% ------------------------------------------------------------- proficiency
{\sts{Getting proficient with Bioconductor}
\bi
\item I: Centrality of software
  \bi
  \item Grasp the importance of independent functional modules (packages)
  in composing workflows
    \bi
    \item in R, packages include functions, data structure schemas,
    \emph{documentation}, example data, and test facilities
    \ei
    \bi
    \item Understand function application -- f(x) can return a
    multifaceted object that can describe itself
    \item programming shorthand: f(g(x)) instead of \{y=g(x); f(y)\};
    x\$y instead of pData(x)[["y"]]
    \item applicative and vector-oriented programming
    \ei
  \item Learn how to design functions that do useful things that must
  be done repetitively (with parametrized variations)
  \item Think compositionally: the output of one procedure may be the
  input to another procedure
  \ei
\ei
}

\clearpage

{\sts{Getting proficient with Bioconductor}
\bi
\item II: Component reuse; usage patterns
  \bi
  \item integrative project: N samples, K technologies, $f_k$ features/technology
  \item samples share annotation on phenotype, demographics
  \item technologies share annotation across experiments
  \item features share annotation relative to genomic context
  \item minimize redundancy by factoring data and metadata into
  `familiar' locations
  \item efficient use of factored resources requires good sense of
  cross-resource linkage
  \ei
\ei
}

\clearpage

% -------------------------------------------------------- container classes
{\stsh{Container classes}

<<>>=
getClass("eSet")
@
}

\clearpage

{\stsh{Expression arrays}

<<>>=
afxsubRMAES
@
}

\clearpage

{\stsh{ChIP-chip archive -- no standard yet}

<<>>=
harbChIP
@
}

\clearpage

{\stsh{Genetics of gene expression -- early standard}

<<>>=
chr20GGdem
@
}

\clearpage

{\stsh{CNV + expression -- no standard yet}

<<>>=
neveExCGH
@
}

\clearpage

% ------------------------------------------------------------------ summary
{\sts{Summary}
\bi
\item you can explore and manipulate data from major published experiments
with a small number of operators
\item R idioms must be mastered: X[G, S] represents a selection of
features G on samples S in experiment X
\item packaged or improvised analyses use functions and objects; reports
use various kinds of serialization
\item major first hurdles: understanding the containers in conjunction
with the annotation system
\ei
}

\end{document}