Data Description

2011-12 NBA球季,ESPN選出的前25名球星,包含

球員姓名(player)、所屬分區(division)、場均得分(ppg)、場均籃板(rpg)、場均助攻(apg)、場均阻攻(blk)、場均抄截(stl)

等數據。我們可以用集群分析來將25名球星歸類。

函數說明

#dist() 計算距離矩陣
#hclust()集群分析
#cutree()指定歸類為n群
#pvclust()計算p值的集群分析
#t()行列互換

1.設定所需的函式庫(libraries)以及載入資料

# One-time setup: uncomment to install pvclust if it is not available yet.
# install.packages("pvclust")
library(pvclust) # cluster analysis with p-values

# NOTE(review): hard-coded working directory; adjust it to where
# 2011NBA.csv lives on your machine.
setwd("d:/Rdata Practice")
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
nba <- read.csv("2011NBA.csv", header = TRUE, sep = ",")
# Preview the first five rows; head() does not pad with NA rows when the
# data has fewer than five records.
head(nba, 5)
##          player division  ppg  rpg apg  blk  stl
## 1  LeBron James     East 27.1  7.9 6.2 0.81 1.85
## 2  Kevin Durant     West 28.0  8.0 3.5 1.17 1.33
## 3 Dwight Howard     East 20.6 14.5 1.9 2.15 1.50
## 4   Christ Paul     West 19.8  3.6 9.1 0.07 2.53
## 5  Derrick Rose     East 21.8  3.4 7.9 0.72 0.90

2.集群分析

# Drop columns 1 and 2 (the text variables player and division) so that only
# the numeric statistics enter the Euclidean distance.
nba.clust <- nba[, -c(1, 2)]
# Per-column mean and standard deviation for the z-score transform.
# vapply() iterates over the data-frame columns directly and always yields a
# numeric vector, with no data-frame-to-matrix coercion.
means <- vapply(nba.clust, mean, numeric(1))
sds <- vapply(nba.clust, sd, numeric(1))
# Standardize every variable to z-scores.
nba.clust <- scale(nba.clust, center = means, scale = sds)
# Euclidean distance matrix between the rows (players).
nba.dist <- dist(nba.clust, method = "euclidean")
# Hierarchical clustering with Ward's method. The former "ward" method has
# been renamed to "ward.D"; there is also a newer "ward.D2".
# NOTE(review): the hclust() documentation states that "ward.D2", not
# "ward.D", implements Ward's (1963) criterion -- confirm which one is
# intended before changing it, since the cluster membership may differ.
nba.fit <- hclust(nba.dist, method = "ward.D")

# Draw the dendrogram, labelling the leaves with the player names.
plot(nba.fit, labels = nba$player, main = "2011-12 NBA TOP25")
# Outline the five clusters with red rectangles.
rect.hclust(nba.fit, k = 5, border = "red")

# Cut the tree into five groups; cluster5 holds the group id of each row.
cluster5 <- cutree(tree = nba.fit, k = 5)
# Show the players that fall into the first cluster.
nba[["player"]][cluster5 == 1]
## [1] LeBron James      Kevin Durant      Kobe Bryant       Dwyane Wade      
## [5] Russell Westbrook
## 25 Levels: Andrew Bynum Blake Griffin Carmelo Anthony ... Tyson Chandler
# List the players of every cluster. lapply() always returns a list, whereas
# sapply() would simplify the result to a matrix whenever all clusters
# happen to have the same number of players.
lapply(unique(cluster5), function(a) nba$player[cluster5 == a])
## [[1]]
## [1] LeBron James      Kevin Durant      Kobe Bryant       Dwyane Wade      
## [5] Russell Westbrook
## 25 Levels: Andrew Bynum Blake Griffin Carmelo Anthony ... Tyson Chandler
## 
## [[2]]
## [1] Dwight Howard  Andrew Bynum   Pau Gasol      Tyson Chandler
## [5] Marc Gasol    
## 25 Levels: Andrew Bynum Blake Griffin Carmelo Anthony ... Tyson Chandler
## 
## [[3]]
## [1] Christ Paul Rajon Rondo
## 25 Levels: Andrew Bynum Blake Griffin Carmelo Anthony ... Tyson Chandler
## 
## [[4]]
## [1] Derrick Rose   Deron Williams Tony Parker    Steve Nash    
## [5] Kyrie Irving   Manu Ginobili 
## 25 Levels: Andrew Bynum Blake Griffin Carmelo Anthony ... Tyson Chandler
## 
## [[5]]
## [1] Kevin Love        Dirk Nowitzki     Blake Griffin     Carmelo Anthony  
## [5] Chris Bosh        LaMarcus Aldridge Kevin Garnett    
## 25 Levels: Andrew Bynum Blake Griffin Carmelo Anthony ... Tyson Chandler
# Names given to clusters 1 to 5, in order.
cluster_labels <- c(
  "scorer", "defender", "point guard", "combo guard", "power forward"
)
# Append the cluster assignment to the original data as a new column.
nba.new <- cbind(nba, cbind(cluster5))
# Turn the numeric cluster ids into a labelled factor.
nba.new$cluster5 <- factor(
  nba.new$cluster5,
  levels = seq_along(cluster_labels),
  labels = cluster_labels
)
# Cross-tabulate division (East/West) against cluster membership.
table(nba.new[["division"]], nba.new[["cluster5"]])
##       
##        scorer defender point guard combo guard power forward
##   East      2        2           1           3             3
##   West      3        3           1           3             4
# Display the full data set together with the assigned cluster labels.
print(nba.new)
##               player division  ppg  rpg  apg  blk  stl      cluster5
## 1       LeBron James     East 27.1  7.9  6.2 0.81 1.85        scorer
## 2       Kevin Durant     West 28.0  8.0  3.5 1.17 1.33        scorer
## 3      Dwight Howard     East 20.6 14.5  1.9 2.15 1.50      defender
## 4        Christ Paul     West 19.8  3.6  9.1 0.07 2.53   point guard
## 5       Derrick Rose     East 21.8  3.4  7.9 0.72 0.90   combo guard
## 6        Kobe Bryant     West 27.9  5.4  4.6 0.31 1.19        scorer
## 7         Kevin Love     West 26.0 13.3  2.0 0.51 0.85 power forward
## 8        Dwyane Wade     East 22.1  4.6  4.8 1.29 1.67        scorer
## 9  Russell Westbrook     West 23.6  4.6  5.5 0.30 1.70        scorer
## 10    Deron Williams     East 21.0  3.3  8.7 0.36 1.22   combo guard
## 11     Dirk Nowitzki     West 21.6  6.7  2.2 0.48 0.68 power forward
## 12       Rajon Rondo     East 11.9  4.9 11.7 0.06 1.79   point guard
## 13      Andrew Bynum     West 18.7 11.8  1.4 1.93 0.45      defender
## 14     Blake Griffin     West 20.7 10.9  3.2 0.73 0.82 power forward
## 15         Pau Gasol     West 17.4 10.4  3.7 1.35 0.57      defender
## 16       Tony Parker     West 18.3  2.9  7.7 0.08 0.95   combo guard
## 17   Carmelo Anthony     East 22.6  6.3  3.6 0.44 1.13 power forward
## 18        Chris Bosh     East 18.0  7.9  1.8 0.79 0.89 power forward
## 19        Steve Nash     West 12.5  3.0 10.7 0.13 0.61   combo guard
## 20 LaMarcus Aldridge     West 21.7  8.0  2.4 0.82 0.93 power forward
## 21     Kevin Garnett     East 15.8  8.2  2.9 1.02 0.93 power forward
## 22      Kyrie Irving     East 18.5  3.7  5.4 0.39 1.06   combo guard
## 23    Tyson Chandler     East 11.3  9.9  0.9 1.40 0.90      defender
## 24        Marc Gasol     West 14.6  8.9  3.1 1.86 0.95      defender
## 25     Manu Ginobili     West 12.9  3.4  4.4 0.35 0.71   combo guard

我們透過集群分析,將NBA 25位球星分為五種類型,分別是得分主力、防守悍將、控球後衛、雙能衛與強力前鋒。集群分析是希望能使群內同質性高,群間異質性大。可以簡單以變異數分析,看看是否達到集群分析目的。

3.變異數分析

# One-way ANOVA of points per game (ppg) against cluster membership.
ppg.model <- lm(nba.new$ppg ~ factor(cluster5))
anova(ppg.model)
## Analysis of Variance Table
## 
## Response: nba.new$ppg
##                  Df Sum Sq Mean Sq F value   Pr(>F)   
## factor(cluster5)  4 301.83  75.458  5.8769 0.002698 **
## Residuals        20 256.79  12.840                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4.集群分析與P值

# The pvclust package provides pvclust(), which computes the AU
# (Approximately Unbiased) p-value and the BP (Bootstrap Probability).
# pvclust() clusters by column, so the standardized matrix is transposed
# with t() first.
nba.by.player <- t(nba.clust)
nbaclust.pvalue <- pvclust(
  nba.by.player,
  method.hclust = "ward.D",
  method.dist = "euclidean"
)
## Bootstrap (r = 0.4)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.4)... Done.
# Plot the pvclust result, labelling the leaves with the player names.
plot(
  nbaclust.pvalue,
  labels = nba[["player"]],
  main = "2011-12 NBA TOP25 with P-value"
)