Bioops

Bioinformatics=(ACGAAG->AK)+(#!/bin/sh)+(P(A|B)=P(B|A)*P(A)/P(B))

Demo of Classification

| Comments

R code demo of

  1. Linear discriminant analysis (LDA)
  2. Quadratic discriminant analysis (QDA)
  3. k-nearest neighbor (KNN)
R code: read and scale the data, then fit and plot LDA
# read data
# Download the admission table as text and parse it. The rest of the script
# uses the columns GPA, GMAT and CLASS by name.
library(RCurl)
data <- getURL("https://raw.githubusercontent.com/bioops/mis_scripts/master/statistics/data/admission.txt")
admission <- read.table(text = data, header = TRUE)
# Store the class label as a factor; its integer codes pick the plot symbols
admission$CLASS <- as.factor(admission$CLASS)

# scale the data
# scale() returns a one-column matrix; as.numeric() keeps the data-frame
# columns as plain numeric vectors
admission$GMAT <- as.numeric(scale(admission$GMAT))
admission$GPA <- as.numeric(scale(admission$GPA))

# (1) LDA
library(MASS)
lda.fit <- lda(CLASS ~ GMAT + GPA, data = admission)
# No new data is passed, so the predictions are for the fitted observations
lda.pred <- predict(lda.fit)
# plot
# different symbols represent true classifications
# different colors are predicted classifications
# Columns are selected by name so the axes always agree with the labels
plot(admission[, c("GPA", "GMAT")], col = lda.pred$class,
     pch = as.numeric(admission$CLASS),
     xlab = "GPA", ylab = "GMAT", main = "LDA")

LDA

R code: fit and plot QDA
# (2) QDA
# Fit quadratic discriminant analysis on GMAT and GPA, then classify the
# same observations the model was fitted on.
qda.fit <- qda(CLASS ~ GMAT + GPA, data = admission)
qda.pred <- predict(qda.fit)
# plot
# point symbol = true class, point color = predicted class
plot(
  admission[, 1:2],
  pch = as.numeric(admission$CLASS),
  col = qda.pred$class,
  main = "QDA", xlab = "GPA", ylab = "GMAT"
)

QDA

R code: choose k by cross-validation, then fit and plot KNN
# (3) KNN
library(class)

# knn.cv() and knn() break ties at random; fixing the seed makes the CV
# curve and the selected k reproducible between runs
set.seed(1)

# tune parameter using cross-validation (CV)
k <- seq(1, 15, 1) # different k
# Leave-one-out misclassification rate for every candidate k, computed in
# one pass instead of growing a vector inside a loop
cv.err <- vapply(k, function(ki) {
  knn.pred.cv <- knn.cv(admission[, c("GPA", "GMAT")], admission$CLASS,
                        k = ki)
  mean(knn.pred.cv != admission$CLASS)
}, numeric(1))
plot(k, cv.err, main = "CV error vs k")

# using the optimal parameter
# which.min() returns the first minimum, so tied errors pick the smallest k;
# the training observations are also used as the test set
knn.pred <- knn(admission[, c("GPA", "GMAT")], admission[, c("GPA", "GMAT")],
                admission$CLASS, k = k[which.min(cv.err)])

# plot
# different symbols represent true classifications
# different colors are predicted classifications
plot(admission[, c("GPA", "GMAT")], col = knn.pred,
     pch = as.numeric(admission$CLASS),
     xlab = "GPA", ylab = "GMAT", main = "KNN")

KNN

Comments