Skip to contents

Gene expression in TB patients and Healthy controls


This data set has been constructed from the gene expression data set accessible in the Gene Expression Omnibus (GEO) at the accession number GSE28623. Ten healthy donors (NID, non-infected donors) and 10 tubercolosis patients (TB) have been randomly selected from the full data set, and top 25 genes with the highest IQR have been selected for further analysis. Genes without an Entrez gene (EG) identifier have likewise been omitted.

The Egambia object is a data frame. The first three columns are the gene symbol, gene name and Entrez gene (EG) identifier. The remaining columns correspond to the gene expression data.


if (FALSE) {
# The data set has been generated as follows:
# get the data set from GEO
gambia <- getGEO("GSE28623")[[1]]

# Convert to limma and normalize
e <- new("EListRaw", list(E= exprs(gambia), genes=fData(gambia), targets= pData(gambia))) <- backgroundCorrect(e, method= "normexp")
en <- normalizeBetweenArrays(, method= "q")
en <- avereps(en, ID=en$genes$NAME)
en <- en[ en$genes$CONTROL_TYPE == "FALSE", ]
en$targets$group <- factor(gsub(" whole blood RNA *", "", en$targets$description))

# Fill in Entrez Gene IDs
en$genes$EG <- ""
sel <- en$genes$REFSEQ %in% ls(org.Hs.egREFSEQ2EG)
en$genes$EG[sel] <- mget(as.character(en$genes$REFSEQ[sel]), org.Hs.egREFSEQ2EG)

# Filter by IQR and missing EG's
iqrs <- apply(en$E, 1, IQR)
en2 <- en[ iqrs > quantile(iqrs, 0.75) & en$genes$EG != "", ]

# Select 10 random samples from NID and TB groups
en2 <- en2[ , c(sample(which(en2$targets$group == "NID"), 10), 
                 sample(which(en2$targets$group == "TB"), 10)) ]
colnames(en2$E) <- en2$targets$group
Egambia <- cbind(en2$genes[ , c("GENE_SYMBOL", "GENE_NAME", "EG") ], en2$E)