library(cluster) #### question 1 simu<-function(n,delta) { tab<-array(data=c(rnorm(8*n), rep(c(-delta,-delta,delta,delta),n), rep(c(-delta,delta,-delta,delta),n),rep(c(1,2,3,4),n)),dim=c(4*n,5)) # les 2 dernières colonnes indiquent l'esperance et la 5eme indique le groupe # on corrige les esperances tab[,1]=tab[,1]+tab[,3] tab[,2]=tab[,2]+tab[,4] o=order(tab[,5]) return(tab[o,]) } n=5 delta=3 donnees<-simu(n,delta) daisy(donnees) #### Fonction agnes cah1 <- agnes(donnees[,1:2], metric = "euclidean", method="ward",stand = FALSE) cah1 plot(cah1) pltree(cah1) # structure du dendrogram structure d2 <- as.dendrogram(cah1) # two main branches d2[[1]] # the first branch d2[[2]] # the 2nd one d2[[1]][[1]]# first sub-branch of branch 1 .. and shorter form identical(d2[[c(1,1)]],d2[[1]][[1]]) ## a "textual picture" of the dendrogram : str(d2) # plot(cah1\$height, type = 'l') cutree(cah1,k=4) plot(donnees[,1:2], pch=c(20,21,22,19)[cutree(cah1,k=4)], col=c("red","blue","orange", "black")[cutree(cah1,k=4)]) table(cutree(cah1,k=4),donnees[,5]) ## rapport variance inter/ variance intra en fonction du nbre de classes nbclassmax=8 resultats=array(data=c(0), dim=c(nbclassmax)) g=array(data=c(mean(donnees[,1]), mean(donnees[,2])),dim=c(2,1)) vartotale=sum((donnees[,1:2]-array(data=c(1), dim=(c(20,1)))%*%t(g))^2)/20 for(k in 1:nbclassmax) { classes=cutree(cah1,k=k) e=array(data=c(1), dim=c(k,1)) poids=table(classes) centres=array(data=c(sapply(1:k,function(i) c(mean(donnees[classes==i,1]),mean(donnees[classes==i,2])))), dim=c(2,k)) varinter=sum(rowSums((t(centres)-(e%*%t(g)))^2)*as.vector(poids))/sum(poids) matcentres=t(array(data=centres[,classes], dim=c(2,4*n))) varintra=sum((donnees[,1:2]-matcentres)^2)/sum(poids) resultats[k]=(varinter/varintra)*(varintra>0) } plot(resultats,type='l') ## question 5 : avec d'autres distances op <- par(mfrow=c(2,2)) cah2 <- agnes(daisy(donnees), diss = TRUE, method = "complete") plot(cah2) cahS <- agnes(donnees, method = "flexible", par.meth = 0.6) plot(cahS) par(op) ## question 6 : Etude par Monte Carlo classif<-function(n,delta,k) { donnees<-simu(n,delta) cah <- agnes(donnees, metric = "euclidean", method="ward",stand = FALSE) tab=table(cutree(cah,k=k),donnees[,5]) return(sum(diag(tab))/sum(tab)) } N=100 n=20 delta=0.2 resultats<-array(data=c(0), dim=c(N)) for(i in 1:N) { resultats[i]=classif(n,delta,4) } hist(resultats, prob=TRUE) lines(density(resultats)) summary(resultats) sd(resultats) abline(v=mean(resultats), col="red", lwd="3") #### Fonction kmeans n=5 delta=3 donnees<-simu(n,delta) k=4 km=kmeans(donnees[,1:2],k) km\$cluster km\$centers # variance totale (non renormalisée) km\$totss # variance intra km\$withinss km\$tot.withinss sum(km\$withinss) # variace inter km\$betweenss km\$betweens+km\$tot.withinss km\$size # alea de la classification ? k=4 donnees<-simu(100,0.2) km=kmeans(donnees[,1:2],k) km2=kmeans(donnees[,1:2],k) table(km\$cluster, km2\$cluster) # Etude du rapport varinter/varintra en fonction du nbre de classes k nbclassmax=8 resultats=array(data=c(0), dim=c(nbclassmax)) for(k in 1:nbclassmax) { km=kmeans(donnees[,1:2],k) resultats[k]=(km\$betweenss/km\$tot.withinss)*(km\$tot.withinss>0) } plot(resultats,type='l') # Etude par Monte-Carlo classif<-function(n,delta,k) { donnees<-simu(n,delta) km=kmeans(donnees,k) tab=table(km\$cluster,donnees[,5]) return(sum(diag(tab))/sum(tab)) } N=100 n=20 delta=0.2 resultats<-array(data=c(0), dim=c(N)) for(i in 1:N) { resultats[i]=classif(n,delta,4) } hist(resultats, prob=TRUE) lines(density(resultats)) summary(resultats) sd(resultats) abline(v=mean(resultats), col="red", lwd="3") ##### Iris de Fisher data(iris) # classification avec kmeans km=kmeans(iris[,1:4],3) plot(iris[,1:2], col=c("blue","red","orange")[km\$cluster],pch=c(20,22,19)[km\$cluster]) plot(iris[,3:4], col=c("blue","red","orange")[km\$cluster],pch=c(20,22,19)[km\$cluster]) acp=princomp(iris[,1:4],cor=TRUE) acp acp\$loadings # acp\$scores contient les coordonnees des individus sur les différents axes factoriels biplot(acp, choices=1:2) plot(acp\$scores[,1:2],col=c("blue","red","orange")[km\$cluster],pch=c(20,22,19)[km\$cluster]) km2=kmeans(acp\$scores[,1:2],3) table(km\$cluster, km2\$cluster) points(acp\$scores[,1:2],col=c("blue","red","orange")[km2\$cluster],pch=c(1,2,5)[km2\$cluster],cex=3)