Data Description

[įŦŦ一įĩ„į¯„äž‹]是內åģēčŗ‡æ–™é›† Diamonds。~54,000 round diamonds from http://www.diamondse.info/

[įŦŦäēŒįĩ„į¯„äž‹]是內åģēčŗ‡æ–™é›† WorldPhones。

[č¨­åŽšæ‰€éœ€įš„å‡ŊåŧåēĢ(libraries)äģĨ及čŧ‰å…Ĩčŗ‡æ–™]

install.packages("ggplot2")
install.packages("ggplot2movies")
library(ggplot2)
library(reshape2)
library(ggplot2movies)

data(diamonds) #įŦŦ一įĩ„data
data(WorldPhones) #įŦŦäēŒįĩ„data
data(movies)  #įŦŦ三įĩ„data

ggplot(data=diamonds, aes(x=cut)) + geom_bar() #a quick example 

PART 1.三大圖型æĻ‚čŋ°

[į•Ģ圖åŸēæœŦæĻ‚åŋĩ]äŊ čĻæąē厚到åē•æ˜¯čĻFactor or Numeric?

[įŦŦä¸€å¤§éĄž] bar plot

str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
head(diamonds)
##   carat       cut color clarity depth table price    x    y    z
## 1  0.23     Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21   Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23      Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29   Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31      Good     J     SI2  63.3    58   335 4.34 4.35 2.75
## 6  0.24 Very Good     J    VVS2  62.8    57   336 3.94 3.96 2.48
summary(diamonds)
##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800  
## 
ggplot(data=diamonds, aes(x=cut)) + geom_bar() #x must be of type factor

ggp <- ggplot(data=diamonds[1:5,], aes(x=cut)) + geom_bar() #geom_bar drops category with no occurrence
ggp

ggplot(data=diamonds, aes(x=cut)) + geom_bar() + coord_flip() #horizontal bar

ggp <- ggp + xlab("Cut") + ylab("Count") + ggtitle("Hello ggplot!") #change label naming
ggp

ggp + geom_bar(fill="snow", color="black") # change color and see colors() if you're picky

1-1.我們可äģĨäē‹å…ˆåšä¸€éģžéģžč¨ˆįŽ—(plot counts as is )

diamonds_precounted <- as.data.frame(table(diamonds$cut, dnn=c("Cut")))
diamonds_precounted
##         Cut  Freq
## 1      Fair  1610
## 2      Good  4906
## 3 Very Good 12082
## 4   Premium 13791
## 5     Ideal 21551
ggplot(diamonds_precounted, aes(x=Cut, y=Freq)) + geom_bar(stat="identity") # default is "bin"

1-2.有關æ–ŧstat=“identity”

A.row should be unique: otherwise counts will be summed up

B.missing label will be present at default: differ from stat=“bin”

C.negative bar is allowed

diamonds[1:5,]
##   carat     cut color clarity depth table price    x    y    z
## 1  0.23   Ideal     E     SI2  61.5    55   326 3.95 3.98 2.43
## 2  0.21 Premium     E     SI1  59.8    61   326 3.89 3.84 2.31
## 3  0.23    Good     E     VS1  56.9    65   327 4.05 4.07 2.31
## 4  0.29 Premium     I     VS2  62.4    58   334 4.20 4.23 2.63
## 5  0.31    Good     J     SI2  63.3    58   335 4.34 4.35 2.75
ggplot(diamonds[1:5,], aes(x=cut, y=depth)) + geom_bar(stat="identity")

ggplot(diamonds_precounted, aes(x=reorder(Cut, -Freq), y=Freq)) + 
  geom_bar(stat='identity') # The order is determined by factor levels

1-3.stack grouping

ggplot(data=diamonds, aes(x=color, fill=cut)) + geom_bar() #by fill

ggplot(data=diamonds, aes(x=color, color=cut)) + geom_bar() #by color

ggplot(data=diamonds, aes(x=color, fill=cut)) + geom_bar(position="dodge") #dodge grouping

1-4.From bar to histogram

when x is numeric

ggplot(data=diamonds, aes(x=price)) + geom_bar()

ggplot(data=diamonds, aes(x=price, fill=cut)) + geom_bar(position="stack") #by stack

ggplot(data=diamonds, aes(x=price, fill=cut)) + geom_bar(position="identity", alpha=.5) #by identity (overlapping)

ggplot(data=diamonds, aes(x=price, fill=cut)) + geom_density(position="identity", alpha=.5) #from histogram to density plot

ggplot(data=diamonds[diamonds$cut %in% c("Fair", "Ideal"),], aes(x=price, fill=cut)) + 
    geom_density(position="identity", alpha=.5) + 
    geom_bar(position="identity", alpha=.5) #both histogram and density (scale wrong?K)

[įŦŦäēŒå¤§éĄž] Line Graph

head(WorldPhones) #įŦŦäēŒįĩ„dataåŸēæœŦæĻ‚čŋ°
##      N.Amer Europe Asia S.Amer Oceania Africa Mid.Amer
## 1951  45939  21574 2876   1815    1646     89      555
## 1956  60423  29990 4708   2568    2366   1411      733
## 1957  64721  32510 5230   2695    2526   1546      773
## 1958  68484  35218 6662   2845    2691   1663      836
## 1959  71799  37598 6856   3000    2868   1769      911
## 1960  76036  40341 8220   3145    3054   1905     1008
str(WorldPhones)
##  num [1:7, 1:7] 45939 60423 64721 68484 71799 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:7] "1951" "1956" "1957" "1958" ...
##   ..$ : chr [1:7] "N.Amer" "Europe" "Asia" "S.Amer" ...
#Remember: ggplot eat only data.frames
WorldPhones.DF <- as.data.frame(WorldPhones)
WorldPhones.DF$year <- rownames(WorldPhones.DF)
class(WorldPhones.DF) # this time we should be fine!
## [1] "data.frame"

2-1.比čŧƒä¸€ä¸‹é€™å…Šį¨Žæ–šæŗ•æœ‰äģ€éēŧ不同

ggplot(WorldPhones.DF, aes(x=year, y=Asia)) + geom_line(size=1.5)

ggplot(WorldPhones.DF, aes(x=as.numeric(year), y=Asia)) + geom_line(size=1.5)

2-2.長表與å¯Ŧ表之čŊ‰æ›

WP=WorldPhones.DF[,c(1:3,8)]
head(WP,8) #Wide format
##      N.Amer Europe Asia year
## 1951  45939  21574 2876 1951
## 1956  60423  29990 4708 1956
## 1957  64721  32510 5230 1957
## 1958  68484  35218 6662 1958
## 1959  71799  37598 6856 1959
## 1960  76036  40341 8220 1960
## 1961  79831  43173 9053 1961
WP.long=melt(WP,id.vars='year') # ?Hmelt?N?????ā´Ģ??Long Format?A?Hid.vars?O?d????
colnames(WP.long)= c('Year','Region','Value')
head(WP.long) #Long format with melt
##   Year Region Value
## 1 1951 N.Amer 45939
## 2 1956 N.Amer 60423
## 3 1957 N.Amer 64721
## 4 1958 N.Amer 68484
## 5 1959 N.Amer 71799
## 6 1960 N.Amer 76036

2-3.各į¨ŽLineįš„不同

WP.long$Year <- as.integer(as.character(WP.long$Year))
ggplot(WP.long, aes(x=Year, y=Value, color=Region)) + geom_line(size=1.5)

ggplot(WP.long, aes(x=Year, y=Value, linetype=Region))+
  geom_line(size=1.5)

ggplot(WP.long,aes(x=factor(Year), y=Value))+
  geom_line(aes(linetype=Region,group=Region),size=1.5)

ggplot(WP.long, aes(x=Year, y=Value, linetype=Region)) + geom_line(size=1.5) +
  guides(linetype=guide_legend(reverse=TRUE))

[įŦŦä¸‰å¤§éĄž] Scatter Plot

head(movies) #įŦŦ三įĩ„dataåŸēæœŦæĻ‚čŋ°
##                      title year length budget rating votes   r1   r2  r3
## 1                        $ 1971    121     NA    6.4   348  4.5  4.5 4.5
## 2        $1000 a Touchdown 1939     71     NA    6.0    20  0.0 14.5 4.5
## 3   $21 a Day Once a Month 1941      7     NA    8.2     5  0.0  0.0 0.0
## 4                  $40,000 1996     70     NA    8.2     6 14.5  0.0 0.0
## 5 $50,000 Climax Show, The 1975     71     NA    3.4    17 24.5  4.5 0.0
## 6                    $pent 2000     91     NA    4.3    45  4.5  4.5 4.5
##     r4   r5   r6   r7   r8   r9  r10 mpaa Action Animation Comedy Drama
## 1  4.5 14.5 24.5 24.5 14.5  4.5  4.5           0         0      1     1
## 2 24.5 14.5 14.5 14.5  4.5  4.5 14.5           0         0      1     0
## 3  0.0  0.0 24.5  0.0 44.5 24.5 24.5           0         1      0     0
## 4  0.0  0.0  0.0  0.0  0.0 34.5 45.5           0         0      1     0
## 5 14.5 14.5  4.5  0.0  0.0  0.0 24.5           0         0      0     0
## 6 14.5 14.5 14.5  4.5  4.5 14.5 14.5           0         0      0     1
##   Documentary Romance Short
## 1           0       0     0
## 2           0       0     0
## 3           0       0     1
## 4           0       0     0
## 5           0       0     0
## 6           0       0     0
str(movies)
## Classes 'tbl_df', 'tbl' and 'data.frame':    58788 obs. of  24 variables:
##  $ title      : chr  "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
##  $ year       : int  1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
##  $ length     : int  121 71 7 70 71 91 93 25 97 61 ...
##  $ budget     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ rating     : num  6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
##  $ votes      : int  348 20 5 6 17 45 200 24 18 51 ...
##  $ r1         : num  4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
##  $ r2         : num  4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
##  $ r3         : num  4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
##  $ r4         : num  4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
##  $ r5         : num  14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
##  $ r6         : num  24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
##  $ r7         : num  24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
##  $ r8         : num  14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
##  $ r9         : num  4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
##  $ r10        : num  4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
##  $ mpaa       : chr  "" "" "" "" ...
##  $ Action     : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Animation  : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ Comedy     : int  1 1 0 1 0 0 0 0 0 0 ...
##  $ Drama      : int  1 0 0 0 0 1 1 0 1 0 ...
##  $ Documentary: int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Romance    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Short      : int  0 0 1 0 0 0 0 1 0 0 ...
movies1 <- movies[!is.na(movies$budget),]
ggplot(movies1, aes(x=budget, y=rating)) + geom_point()

3-1.控åˆļpointįš„各į¨Žåž‹æ…‹

ggplot(movies1, aes(x=budget, y=rating)) + geom_point(shape=5, size=3)

3-2.Grouping

ggplot(movies1, aes(x=budget, y=rating, color=Action)) + geom_point() #Grouping: by binary variable

ggplot(movies1, aes(x=budget, y=rating, color=factor(Action))) + 
    geom_point() + labs(color='Action Movie?') #Grouping: by categarical variable

ggplot(movies1, aes(x=budget, y=rating, color=factor(Action), shape=(length > 120))) + 
    geom_point(size=3) + labs(color='Action Movie?') #Multi-grouping

3-3.regression line

ggplot(movies, aes(x=votes, y=rating)) + geom_point() +
    stat_smooth(method=lm, level=.95) # add se=FALSE to disable CI

ggplot(movies, aes(x=votes, y=rating)) + geom_point() + stat_smooth() #The default is a polynomial fit

ggplot(movies1, aes(x=budget, y=rating, color=factor(Action))) + 
    geom_point() + labs(color='Action Movie?') + stat_smooth(method=lm, se=FALSE) #Fitting is smart to align with grouping

PART 2.åŸēæœŦįĩąč¨ˆé‹įŽ—

2-1-1. Which Type of Film Cost the Most in Average?

movietype <- colnames(movies)[18:24]
movies1_singletype <- movies1[rowSums(movies1[, movietype]) == 1,] # remove multi-typed
movietype_alt <- c(movietype[length(movietype)], movietype[-length(movietype)]) 
# convert multiple dummies into one factor as grouping var
# a little matrix operation will do the trick
dummies <- as.matrix(movies1_singletype[, movietype_alt])
movies1_singletype$Type <- factor(dummies %*% (1:length(movietype_alt)), labels=movietype_alt)

# Compute the Average Budget of Each Type
tapply(movies1_singletype$budget, movies1_singletype$Type, mean)
##       Short      Action   Animation      Comedy       Drama Documentary 
##    396133.1  32698189.6  32311451.6  11921970.6  10456690.5    729704.8 
##     Romance 
##   5603688.0

2-1-2. Determine the variation

#The first factor level of movietype, Short, is represented as the intercept term.
lmfit <- lm(as.formula("budget ~ Type"), movies1_singletype)
summary(lmfit)$coef 
##                   Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)       396133.1    1715935  0.2308556 8.174455e-01
## TypeAction      32302056.5    2062063 15.6649187 7.005190e-53
## TypeAnimation   31915318.5    4157723  7.6761525 2.316874e-14
## TypeComedy      11525837.5    1888686  6.1025705 1.202481e-09
## TypeDrama       10060557.4    1820528  5.5261750 3.604075e-08
## TypeDocumentary   333571.7    2881175  0.1157763 9.078389e-01
## TypeRomance      5207554.9    3713295  1.4024082 1.609149e-01

2-1-3. Another way to estimate the coefficients

#The last predictor, Short is combined into the intercept term.
# mean(movies1_singletype[movies1_singletype$Animation == 1, 'budget'])
lmfit <- lm(as.formula(paste('budget ~', paste(movietype, collapse=' + '))), 
            movies1_singletype)
summary(lmfit)$coef
##               Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)   396133.1    1715935  0.2308556 8.174455e-01
## Action      32302056.5    2062063 15.6649187 7.005190e-53
## Animation   31915318.5    4157723  7.6761525 2.316874e-14
## Comedy      11525837.5    1888686  6.1025705 1.202481e-09
## Drama       10060557.4    1820528  5.5261750 3.604075e-08
## Documentary   333571.7    2881175  0.1157763 9.078389e-01
## Romance      5207554.9    3713295  1.4024082 1.609149e-01

2-1-4. Draw the regression lines of each type

#What is the association between cost and rating, conditional on type?

movies1_reg_plot <- ggplot(movies1_singletype, aes(x=budget, y=rating, color=Type)) + 
  geom_point(shape=1) +
  
  # set fullrange=T will extend the fitted line outside the sample range
  stat_smooth(method=lm, se=FALSE, fullrange=FALSE, size=1.5) +
  
  # color is the grouping interface, hence scale_color_*
  scale_color_discrete(name='Movie Type: # of samples', 
                       labels=paste(levels(movies1_singletype$Type), ': ', 
                                    table(movies1_singletype$Type)))

2-1-5.Output

movies1_reg_plot