Gene expression in TB patients and Healthy controls

Details

This data set has been constructed from the gene expression data set accessible in the Gene Expression Omnibus (GEO) at the accession number GSE28623. Ten healthy donors (NID, non-infected donors) and 10 tubercolosis patients (TB) have been randomly selected from the full data set, and top 25 genes with the highest IQR have been selected for further analysis. Genes without an Entrez gene (EG) identifier have likewise been omitted.

The Egambia object is a data frame. The first three columns are the gene symbol, gene name and Entrez gene (EG) identifier. The remaining columns correspond to the gene expression data.

Examples

if (FALSE) {
# The data set has been generated as follows:
# get the data set from GEO
library(GEOquery)
gambia <- getGEO("GSE28623")[[1]]

# Convert to limma and normalize
library(limma)
e <- new("EListRaw", list(E= exprs(gambia), genes=fData(gambia), targets= pData(gambia)))
e.bg <- backgroundCorrect(e, method= "normexp")
en <- normalizeBetweenArrays(e.bg, method= "q")
en <- avereps(en, ID=en$genes$NAME)
en <- en[ en$genes$CONTROL_TYPE == "FALSE", ]
en$targets$group <- factor(gsub(" whole blood RNA *", "", en$targets$description))

# Fill in Entrez Gene IDs
library(org.Hs.eg.db)
en$genes$EG <- ""
sel <- en$genes$REFSEQ %in% ls(org.Hs.egREFSEQ2EG)
en$genes$EG[sel] <- mget(as.character(en$genes$REFSEQ[sel]), org.Hs.egREFSEQ2EG)

# Filter by IQR and missing EG's
iqrs <- apply(en$E, 1, IQR)
en2 <- en[ iqrs > quantile(iqrs, 0.75) & en$genes$EG != "", ]

# Select 10 random samples from NID and TB groups
en2 <- en2[ , c(sample(which(en2$targets$group == "NID"), 10), 
                 sample(which(en2$targets$group == "TB"), 10)) ]
colnames(en2$E) <- en2$targets$group
Egambia <- cbind(en2$genes[ , c("GENE_SYMBOL", "GENE_NAME", "EG") ], en2$E)
}