%
% NOTE -- ONLY EDIT THE .Rnw FILE!!! The .tex file is
% likely to be overwritten.
%
\documentclass[landscape]{article}
% --- Packages ---------------------------------------------------------
\usepackage{amsmath,pstricks}
\usepackage[authoryear,round]{natbib}
\usepackage{hyperref}
\usepackage{sectsty}
\usepackage{landscape}
% graphicx (which itself loads graphics) provides the key-value interface
% that the \setkeys{Gin}{width=...} calls in the body depend on.
\usepackage{graphicx}
% --- Page geometry ----------------------------------------------------
% A stale "\textwidth=10.9in" used to be set here; it was overridden by the
% 8.2in assignment below before anything read it, so only the effective
% value is kept.
\textheight=6.5in
%\parskip=.3cm
\oddsidemargin=.0in
\evensidemargin=.0in
\headheight=-.3in
% --- Math style shorthands --------------------------------------------
\newcommand{\scscst}{\scriptscriptstyle}
\newcommand{\scst}{\scriptstyle}
% --- Font size switches -----------------------------------------------
\newcommand{\dimfo}{\fontsize{14}{16}\selectfont}
\newcommand{\expfo}{\fontsize{18}{22}\selectfont}
% --- Markup for R entities (typewriter for code, italic for names) ----
\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\texttt{#1}}}
\newcommand{\Rfunarg}[1]{{\texttt{#1}}}
\newcommand{\Rclass}[1]{{\textit{#1}}}
% --- List shorthands: \bi ... \ei == itemize environment --------------
\newcommand\bi{\begin{itemize}}
\newcommand\ei{\end{itemize}}
% Effective text width for the whole document.
\textwidth=8.2in
\bibliographystyle{plainnat}
% Sans-serif as the default family for the slides.
\renewcommand{\familydefault}{\sfdefault}
\usepackage[T1]{fontenc}
\newcommand\cp{{\clearpage}}
% --- Slide titles -----------------------------------------------------
% Each macro switches the font size and typesets its argument in bold.
% The size switch is NOT scoped by the macro itself, so it stays in force
% for whatever follows; the slides wrap each use in a brace group.
\newcommand\sts[1]{\Huge \textbf{#1}}
\newcommand\stsh[1]{\huge \textbf{#1}}
\newcommand\stsL[1]{\Large \textbf{#1}}
\newcommand\stsl[1]{\large \textbf{#1}}
\begin{document}
\fontsize{18}{22}
\selectfont
\allsectionsfont{\sffamily}
\sectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}
\subsectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}
\subsubsectionfont{\fontfamily{phv}\fontsize{18}{22}\selectfont}
%\setkeys{Gin}{width=0.85\textwidth}
{
\Huge
\begin{center}
\textbf{Bioconductor training March 2008 \\
Lecture 1: Introductory}
\end{center}
\vspace*{2cm}
\begin{itemize}
\item quick case study on annotation
\item survey: mRNA abundance, ChIP-chip, SNP+expression, CNV+expression
\item schematization of HT experiments
\item Bioc proficiency concepts; container design introduction
\ei
}
\clearpage
{
\Huge
\begin{center}
\textbf{Pathway annotation case study \\ intro insert}
\end{center}
\begin{itemize}
\item ``Gene sets'' are popular tools for analysis
\item rapid survey of a large family of gene sets is
facilitated by programming
\item conversion of moderately conventional annotation for
genes/gene sets to operators on Bioconductor data structures
is illustrated
\ei
}
\clearpage
{\sts{Is my gene in any pathways?}
\bi
\item case of TBX21
\item KEGG -- nothing
\item NCBI -- nothing
\item What about the Broad GSEA-related gene sets?
\item Bioconductor package GSEABase helps navigate these
\ei
}
\clearpage
{\stsL{broadsets.rda}
<<>>=
library(GSEABase)
if (!exists("broadsets")) load("broadsets.rda")
broadsets
class(broadsets)
getClass(class(broadsets))
@
}
\clearpage
{\stsL{information on a set}
<<>>=
broadsets[[1]]
details(broadsets[[1]])
@
}
\clearpage
{\stsL{GeneSetCollection operations}
\bi
\item a GeneSetCollection instance is an R list of
GeneSets
\item iteration over list elements is relatively
easy in R
\item need to know how to operate usefully on
a GeneSet
\item poked at one above with `details' method
\item another method of interest: geneIds
<<>>=
geneIds(broadsets[[1]])
allids = lapply(broadsets, geneIds)
tbxchk = sapply(allids, function(x) any(x == "TBX21"))
sum(tbxchk)
hastbx = which(tbxchk)
sapply(broadsets[hastbx], setName)
@
\ei
}
\clearpage
{\stsL{More info}
\bi
\item we have identified 12 `gene sets' that include TBX21
\item what are they? we can see that one is just a cytoband
\ei
<<>>=
broadsets[["V$LYF1_01"]]
@
}
\clearpage
{\stsL{More info}
<<>>=
details(broadsets[["V$LYF1_01"]])
longDescription(broadsets[["V$LYF1_01"]])
@
\bi
\item
This longDescription result is not very useful ... it is
a long string of HTML. If we write it to a file, we can run
a browser. Use writeLines
\ei
}
\clearpage
\setkeys{Gin}{width=1.15\textwidth}
\includegraphics{lyf1}
\clearpage
\includegraphics{lyf1b}
\clearpage
{\sts{Upshots}
\bi
\item A collection of over 3000 sets of genes is
bound to a single R variable name (broadsets)
\item Each set is self-documenting and includes a list of
HUGO identifiers (as given by Broad)
\item methods \verb+geneIds+, \verb+details+, \verb+longDescription+
provide uniform information on each set
\item programming expertise useful
\bi
\item general string matching (\verb+x == "TBX21"+) or pattern
matching (grep, case conversion, etc.) available directly to constituents
\item shortcuts \verb+broadsets[["V$LYF1_01"]]+
\ei
\item exploit sets and their structures for thorough statistical analysis
\ei
}
\clearpage
{\stsL{Application -- note number of features retained}
<<>>=
library(Biobase)
library(ALL)
data(ALL)
keep = broadsets[["V$LYF1_01"]]
geneIdType(keep) = AnnotationIdentifier(annotation(ALL))
ALL[ geneIds(keep), ]
@
}
\clearpage
{\sts{Four technologies}
\bi
\item the concepts
\bi
\item mRNA transcript abundance -- expression arrays
\item binding of TFs to DNA -- ChIP-chip
\item high-density genotyping -- SNP chips (with expression:
genetical genomics)
\item aCGH and other approaches to copy number variation (CNV)
\ei
\item What do we do with these?
\bi
\item differential expression, coexpression
\item theory of transcriptional regulation
\item genetic association studies
\item assessment of genomic lesions
\ei
\ei
}
\clearpage
% Setup chunk: attaches the data packages and loads the example objects that
% the remaining slides use.
% NOTE(review): the original chunk options were lost; echo=FALSE,results=hide
% is assumed because this chunk has no slide of its own and the following
% slides repeat the library()/data() calls for display -- confirm.
<<echo=FALSE,results=hide>>=
# search() lists attached packages as "package:<name>", so the membership
# tests must use the prefixed form; the bare package name never matches.
if (!("package:MAQCsubset" %in% search())) library(MAQCsubset)
if (!exists("afxsubRMAES")) data(afxsubRMAES)
if (!("package:harbChIP" %in% search())) library(harbChIP)
if (!("package:Neve2006" %in% search())) library(Neve2006)
if (!("package:GGtools" %in% search())) library(GGtools)
if (!exists("neveExCGH")) data(neveExCGH)
if (!exists("chr20GGdem")) data(chr20GGdem)
data(harbChIP)
load("scr2.rda")
# Narrow console width so printed output fits the large slide font.
options(width=60)
@
{
\stsh{Differential expression: an example}
<<fig=TRUE>>=
library(MAQCsubset)
data(afxsubRMAES) # RMA preprocessing, ES = ExpressionSet
plot( exprs(afxsubRMAES)["206253_at",] ~ afxsubRMAES$pctBrain )
@
}
\clearpage
{\sts{Questions}
\bi
\item How do we interpret `206253\_at' in the foregoing?
\item How can we make a useful (hyperlinked) report on
annotation related to the implicated gene?
\ei
}
\clearpage
{\sts{Solutions 1: annotate::lookUp}
<<>>=
library(hgu133plus2.db)
library(annotate)
lookUp("206253_at", "hgu133plus2", "GENENAME")
@
}
\clearpage
{\sts{Solutions 2: annaffy::aafTableAnn}
<<>>=
library(annaffy)
library(annotate)
dem = aafTableAnn("206253_at", "hgu133plus2.db")
saveHTML( dem, file="abc.html" )
@
}
\clearpage
\setkeys{Gin}{width=1.05\textwidth}
\includegraphics{probeList}
\clearpage
{
\stsh{ChIP-chip: an example}
}
\setkeys{Gin}{width=.85\textwidth}
<<fig=TRUE>>=
library(harbChIP); data(harbChIP)
par(mfrow=c(2,2))
dim(exprs(harbChIP))
qqnorm(exprs(harbChIP)[,"ABF1"], main="ABF1", pch=".")
qqnorm(exprs(harbChIP)[,"BYE1"], main="BYE1", pch=".")
qqnorm(exprs(harbChIP)[,"GAL80"], main="GAL80", pch=".")
qqnorm(exprs(harbChIP)[,"HAL9"], main="HAL9", pch=".")
par(mfrow=c(1,1))
@
\clearpage
\setkeys{Gin}{width=1.10\textwidth}
\includegraphics{GAL80}
\clearpage
{\stsh{Using the ChIP-chip data}
<<>>=
library(YEAST)
higal80 = sort(-exprs(harbChIP)[,"GAL80"])[1:10]
unlist(lookUp(names(higal80), "YEAST", "GENENAME"))
library(parody)
calout.detect(na.omit(exprs(harbChIP)[,"GAL80"]))$val
@
}
\clearpage
{\stsh{Genetics of gene expression}
\setkeys{Gin}{width=0.75\textwidth}
<<fig=TRUE>>=
library(GGtools)
data(chr20GGdem)
snps(chr20GGdem)[2310:2314,1:4] # rare allele count
plot(snps(chr20GGdem)["rs6060535",],
exprs(chr20GGdem)["206918_s_at",], main="CPNE1")
@
}
\clearpage
\setkeys{Gin}{width=0.75\textwidth}
\includegraphics{doss}
\clearpage
{\stsh{Questions}
\bi
\item What is known about location and role of SNP rs6060535?
\ei
}
\clearpage
<<>>=
library(biomaRt)
mm = useMart("snp", dataset="hsapiens_snp")
listFilters(mm)[1:2,]
listAttributes(mm)[1:2,]
@
\clearpage
<<>>=
getBM(attributes=c("chr_name", "chrom_start", "ensembl_external_gene_id",
"ensembl_type", "ensembl_syn_summary", "snp_allele"),
filters=c("refsnp"), value="rs6060535", mart=mm)
@
\clearpage
{\stsh{chromosomal lesions: example}
<<>>=
library(Neve2006); data(neveExCGH)
experimentData(neveExCGH)
@
}
\clearpage
<<>>=
table(neveExCGH$tumorType, neveExCGH$geneCluster)
@
\clearpage
{\sts{chromosomal lesions: example}
<<fig=TRUE>>=
plot(cloneMeta(neveExCGH)$kbGenome,
logRatios(neveExCGH)[,1], pch=".", cex=4,
xlab="kbGenome", ylab="log ratio",
main = neveExCGH$tumorType[1])
@
}
\clearpage
<<fig=TRUE>>=
par(mfrow=c(2,2))
for (i in 1:4)
plot(cloneMeta(neveExCGH)$kbGenome,
logRatios(neveExCGH)[,i], pch=".", cex=4,
xlab="kbGenome", ylab="log ratio",
main = neveExCGH$tumorType[i])
par(mfrow=c(1,1))
@
\clearpage
{\sts{Questions}
\bi
\item How can we use the aCGH log ratios to identify
genomic locations of chromosomal lesions?
\item How can we determine proximity of these lesions to genes?
\item What is a sound approach to understanding joint behavior of
genomic lesions, differential expression, and phenotype?
\ei
}
\clearpage
{\stsh{Common threads}
\bi
\item lots of features (high-throughput), not so many
samples
\item features have complex annotation
\bi
\item probe sequence and its context
\item functional categorization (GO)
\item pathway membership and role (KEGG, etc.)
\ei
\item samples (should) have complex annotation
\bi
\item experimental conditions
\item phenotype, disease state, demographics
\ei
\item analytic agenda is open-ended, integrative
\ei
}
\clearpage
{\stsh{Two motivations for Bioconductor}
\bi
\item Software is a primary tool for discovery in
high-throughput biology
\bi
\item must support methodologists, developers -- reduce
barriers to entry (for methodologists) to biological applications
\item must support/educate/inspire users -- reduce
barriers to entry (for investigators) to new software and
analyses
\ei
\item Many problems and solutions have shared structures
\bi
\item resources can be modified or extended for efficient
reuse
\item examples of reusable resources:
\bi
\item tools for inference on linear models
\item tools for machine learning
\item structures for multicomponent assay outputs
\ei
\ei
\ei
}
\clearpage
\begin{verbatim}
raw
__________
|..______|__
|..|...____|____
|..|...|.......| MD1: map geometry of chip
|..|...|.......| S features to sequence/source
|__|...|.......| per chip
|___|.......| MD2: map sequence to biological
|_______| reference annotation
| MD3: record hybridization
preprocessing metadata including phenotype
| and experimental design
V
--------------------------------------------
N chips -> ExpressionSet instance
\end{verbatim}
\clearpage
\begin{verbatim}
N chips -> ExpressionSet instance
phenoData:
P
id sex disease ...
|----------------------|
AssayData: N | |
exprs N | |
__________________________ | |
| 2.2|... | | | |
| 1.7| | | ------------------------
| .| | | + varMetadata
| .| | |
| .| | |
G | | | | + featureData (probe metadata refs)
| | | | + experimentData (MIAME)
| | | |
| | | |
| | | |
| | | |
__________________________
\end{verbatim}
\clearpage
{\sts{Getting proficient with Bioconductor}
\bi
\item I: Centrality of software
\bi
\item Grasp the importance of independent functional modules
(packages) in composing workflows
\bi
\item in R, packages include functions, data structure
schemas, \textit{documentation}, example data, and
test facilities
\ei
\bi
\item Understand function application -- f(x) can return a
multifaceted object that can describe itself
\item programming shorthand: \verb+f(g(x))+ instead of \verb+{y=g(x); f(y)}+;
\verb+x$y+ instead of \verb+pData(x)[["y"]]+
\item applicative and vector-oriented programming
\ei
\item Learn how to design functions that do useful things
that must be done repetitively (with parametrized variations)
\item Think compositionally: the output of one procedure may
be the input to another procedure
\ei
\ei
}
\clearpage
{\sts{Getting proficient with Bioconductor}
\bi
\item II: Component reuse; usage patterns
\bi
\item integrative project: N samples, K technologies, $f_k$ features/technology
\item samples share annotation on phenotype, demographics
\item technologies share annotation across experiments
\item features share annotation relative to genomic context
\item minimize redundancy by factoring data and metadata
into `familiar' locations
\item efficient use of factored resources requires good
sense of cross-resource linkage
\ei
\ei
}
\clearpage
{\stsh{Container classes}
<<>>=
getClass("eSet")
@
}
\clearpage
{\stsh{Expression arrays}
<<>>=
afxsubRMAES
@
}
\clearpage
{\stsh{ChIP-chip archive -- no standard yet}
<<>>=
harbChIP
@
}
\clearpage
{\stsh{Genetics of gene expression -- early standard}
<<>>=
chr20GGdem
@
}
\clearpage
{\stsh{CNV + expression -- no standard yet}
<<>>=
neveExCGH
@
}
\clearpage
{\sts{Summary}
\bi
\item you can explore and manipulate data from major published experiments
with a small number of operators
\item R idioms must be mastered: X[G, S] represents a selection of
features G on samples S in experiment X
\item packaged or improvised analyses use functions and objects; reports
use various kinds of serialization
\item major first hurdles: understanding the containers in conjunction with the annotation system
\ei
}
\end{document}