PART 1.ä¸å¤§ååæĻčŋ°
[įĢååēæŦæĻåŋĩ]äŊ čĻæąēåŽå°åēæ¯čĻFactor or Numeric?
[įŦŦä¸å¤§éĄ] bar plot
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame': 53940 obs. of 10 variables:
## $ carat : num 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
head(diamonds)
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
summary(diamonds)
## carat cut color clarity
## Min. :0.2000 Fair : 1610 D: 6775 SI1 :13065
## 1st Qu.:0.4000 Good : 4906 E: 9797 VS2 :12258
## Median :0.7000 Very Good:12082 F: 9542 SI2 : 9194
## Mean :0.7979 Premium :13791 G:11292 VS1 : 8171
## 3rd Qu.:1.0400 Ideal :21551 H: 8304 VVS2 : 5066
## Max. :5.0100 I: 5422 VVS1 : 3655
## J: 2808 (Other): 2531
## depth table price x
## Min. :43.00 Min. :43.00 Min. : 326 Min. : 0.000
## 1st Qu.:61.00 1st Qu.:56.00 1st Qu.: 950 1st Qu.: 4.710
## Median :61.80 Median :57.00 Median : 2401 Median : 5.700
## Mean :61.75 Mean :57.46 Mean : 3933 Mean : 5.731
## 3rd Qu.:62.50 3rd Qu.:59.00 3rd Qu.: 5324 3rd Qu.: 6.540
## Max. :79.00 Max. :95.00 Max. :18823 Max. :10.740
##
## y z
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.720 1st Qu.: 2.910
## Median : 5.710 Median : 3.530
## Mean : 5.735 Mean : 3.539
## 3rd Qu.: 6.540 3rd Qu.: 4.040
## Max. :58.900 Max. :31.800
##
ggplot(data=diamonds, aes(x=cut)) + geom_bar() #x must be of type factor
ggp <- ggplot(data=diamonds[1:5,], aes(x=cut)) + geom_bar() #geom_bar drops category with no occurrence
ggp
ggplot(data=diamonds, aes(x=cut)) + geom_bar() + coord_flip() #horizontal bar
ggp <- ggp + xlab("Cut") + ylab("Count") + ggtitle("Hello ggplot!") #change label naming
ggp
ggp + geom_bar(fill="snow", color="black") # change color and see colors() if you're picky
1-1.æåå¯äģĨäēå
åä¸éģéģč¨įŽ(plot counts as is )
diamonds_precounted <- as.data.frame(table(diamonds$cut, dnn=c("Cut")))
diamonds_precounted
## Cut Freq
## 1 Fair 1610
## 2 Good 4906
## 3 Very Good 12082
## 4 Premium 13791
## 5 Ideal 21551
ggplot(diamonds_precounted, aes(x=Cut, y=Freq)) + geom_bar(stat="identity") # default is "bin"
1-2.æéæŧstat=âidentityâ
A.row should be unique: otherwise counts will be summed up
B.missing label will be present at default: differ from stat=âbinâ
C.negative bar is allowed
diamonds[1:5,]
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
ggplot(diamonds[1:5,], aes(x=cut, y=depth)) + geom_bar(stat="identity")
ggplot(diamonds_precounted, aes(x=reorder(Cut, -Freq), y=Freq)) +
geom_bar(stat='identity') # The order is determined by factor levels
1-3.stack grouping
ggplot(data=diamonds, aes(x=color, fill=cut)) + geom_bar() #by fill
ggplot(data=diamonds, aes(x=color, color=cut)) + geom_bar() #by color
ggplot(data=diamonds, aes(x=color, fill=cut)) + geom_bar(position="dodge") #dodge grouping
1-4.From bar to histogram
when x is numeric
ggplot(data=diamonds, aes(x=price)) + geom_bar()
ggplot(data=diamonds, aes(x=price, fill=cut)) + geom_bar(position="stack") #by stack
ggplot(data=diamonds, aes(x=price, fill=cut)) + geom_bar(position="identity", alpha=.5) #by identity (overlapping)
ggplot(data=diamonds, aes(x=price, fill=cut)) + geom_density(position="identity", alpha=.5) #from histogram to density plot
ggplot(data=diamonds[diamonds$cut %in% c("Fair", "Ideal"),], aes(x=price, fill=cut)) +
geom_density(position="identity", alpha=.5) +
geom_bar(position="identity", alpha=.5) #both histogram and density (scale wrong?K)
[įŦŦäē大éĄ] Line Graph
head(WorldPhones) #įŦŦäēįĩdataåēæŦæĻčŋ°
## N.Amer Europe Asia S.Amer Oceania Africa Mid.Amer
## 1951 45939 21574 2876 1815 1646 89 555
## 1956 60423 29990 4708 2568 2366 1411 733
## 1957 64721 32510 5230 2695 2526 1546 773
## 1958 68484 35218 6662 2845 2691 1663 836
## 1959 71799 37598 6856 3000 2868 1769 911
## 1960 76036 40341 8220 3145 3054 1905 1008
str(WorldPhones)
## num [1:7, 1:7] 45939 60423 64721 68484 71799 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:7] "1951" "1956" "1957" "1958" ...
## ..$ : chr [1:7] "N.Amer" "Europe" "Asia" "S.Amer" ...
#Remember: ggplot eat only data.frames
WorldPhones.DF <- as.data.frame(WorldPhones)
WorldPhones.DF$year <- rownames(WorldPhones.DF)
class(WorldPhones.DF) # this time we should be fine!
## [1] "data.frame"
2-1.æ¯čŧä¸ä¸éå
Šį¨Žæšæŗæäģéēŧä¸å
ggplot(WorldPhones.DF, aes(x=year, y=Asia)) + geom_line(size=1.5)
ggplot(WorldPhones.DF, aes(x=as.numeric(year), y=Asia)) + geom_line(size=1.5)
2-2.éˇčĄ¨čå¯Ŧ襨äščŊæ
WP=WorldPhones.DF[,c(1:3,8)]
head(WP,8) #Wide format
## N.Amer Europe Asia year
## 1951 45939 21574 2876 1951
## 1956 60423 29990 4708 1956
## 1957 64721 32510 5230 1957
## 1958 68484 35218 6662 1958
## 1959 71799 37598 6856 1959
## 1960 76036 40341 8220 1960
## 1961 79831 43173 9053 1961
WP.long=melt(WP,id.vars='year') # ?Hmelt?N?????ā´Ģ??Long Format?A?Hid.vars?O?d????
colnames(WP.long)= c('Year','Region','Value')
head(WP.long) #Long format with melt
## Year Region Value
## 1 1951 N.Amer 45939
## 2 1956 N.Amer 60423
## 3 1957 N.Amer 64721
## 4 1958 N.Amer 68484
## 5 1959 N.Amer 71799
## 6 1960 N.Amer 76036
2-3.åį¨ŽLineįä¸å
WP.long$Year <- as.integer(as.character(WP.long$Year))
ggplot(WP.long, aes(x=Year, y=Value, color=Region)) + geom_line(size=1.5)
ggplot(WP.long, aes(x=Year, y=Value, linetype=Region))+
geom_line(size=1.5)
ggplot(WP.long,aes(x=factor(Year), y=Value))+
geom_line(aes(linetype=Region,group=Region),size=1.5)
ggplot(WP.long, aes(x=Year, y=Value, linetype=Region)) + geom_line(size=1.5) +
guides(linetype=guide_legend(reverse=TRUE))
[įŦŦä¸å¤§éĄ] Scatter Plot
head(movies) #įŦŦä¸įĩdataåēæŦæĻčŋ°
## title year length budget rating votes r1 r2 r3
## 1 $ 1971 121 NA 6.4 348 4.5 4.5 4.5
## 2 $1000 a Touchdown 1939 71 NA 6.0 20 0.0 14.5 4.5
## 3 $21 a Day Once a Month 1941 7 NA 8.2 5 0.0 0.0 0.0
## 4 $40,000 1996 70 NA 8.2 6 14.5 0.0 0.0
## 5 $50,000 Climax Show, The 1975 71 NA 3.4 17 24.5 4.5 0.0
## 6 $pent 2000 91 NA 4.3 45 4.5 4.5 4.5
## r4 r5 r6 r7 r8 r9 r10 mpaa Action Animation Comedy Drama
## 1 4.5 14.5 24.5 24.5 14.5 4.5 4.5 0 0 1 1
## 2 24.5 14.5 14.5 14.5 4.5 4.5 14.5 0 0 1 0
## 3 0.0 0.0 24.5 0.0 44.5 24.5 24.5 0 1 0 0
## 4 0.0 0.0 0.0 0.0 0.0 34.5 45.5 0 0 1 0
## 5 14.5 14.5 4.5 0.0 0.0 0.0 24.5 0 0 0 0
## 6 14.5 14.5 14.5 4.5 4.5 14.5 14.5 0 0 0 1
## Documentary Romance Short
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
str(movies)
## Classes 'tbl_df', 'tbl' and 'data.frame': 58788 obs. of 24 variables:
## $ title : chr "$" "$1000 a Touchdown" "$21 a Day Once a Month" "$40,000" ...
## $ year : int 1971 1939 1941 1996 1975 2000 2002 2002 1987 1917 ...
## $ length : int 121 71 7 70 71 91 93 25 97 61 ...
## $ budget : int NA NA NA NA NA NA NA NA NA NA ...
## $ rating : num 6.4 6 8.2 8.2 3.4 4.3 5.3 6.7 6.6 6 ...
## $ votes : int 348 20 5 6 17 45 200 24 18 51 ...
## $ r1 : num 4.5 0 0 14.5 24.5 4.5 4.5 4.5 4.5 4.5 ...
## $ r2 : num 4.5 14.5 0 0 4.5 4.5 0 4.5 4.5 0 ...
## $ r3 : num 4.5 4.5 0 0 0 4.5 4.5 4.5 4.5 4.5 ...
## $ r4 : num 4.5 24.5 0 0 14.5 14.5 4.5 4.5 0 4.5 ...
## $ r5 : num 14.5 14.5 0 0 14.5 14.5 24.5 4.5 0 4.5 ...
## $ r6 : num 24.5 14.5 24.5 0 4.5 14.5 24.5 14.5 0 44.5 ...
## $ r7 : num 24.5 14.5 0 0 0 4.5 14.5 14.5 34.5 14.5 ...
## $ r8 : num 14.5 4.5 44.5 0 0 4.5 4.5 14.5 14.5 4.5 ...
## $ r9 : num 4.5 4.5 24.5 34.5 0 14.5 4.5 4.5 4.5 4.5 ...
## $ r10 : num 4.5 14.5 24.5 45.5 24.5 14.5 14.5 14.5 24.5 4.5 ...
## $ mpaa : chr "" "" "" "" ...
## $ Action : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Animation : int 0 0 1 0 0 0 0 0 0 0 ...
## $ Comedy : int 1 1 0 1 0 0 0 0 0 0 ...
## $ Drama : int 1 0 0 0 0 1 1 0 1 0 ...
## $ Documentary: int 0 0 0 0 0 0 0 1 0 0 ...
## $ Romance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Short : int 0 0 1 0 0 0 0 1 0 0 ...
movies1 <- movies[!is.na(movies$budget),]
ggplot(movies1, aes(x=budget, y=rating)) + geom_point()
3-1.æ§åļpointįåį¨Žåæ
ggplot(movies1, aes(x=budget, y=rating)) + geom_point(shape=5, size=3)
3-2.Grouping
ggplot(movies1, aes(x=budget, y=rating, color=Action)) + geom_point() #Grouping: by binary variable
ggplot(movies1, aes(x=budget, y=rating, color=factor(Action))) +
geom_point() + labs(color='Action Movie?') #Grouping: by categarical variable
ggplot(movies1, aes(x=budget, y=rating, color=factor(Action), shape=(length > 120))) +
geom_point(size=3) + labs(color='Action Movie?') #Multi-grouping
3-3.regression line
ggplot(movies, aes(x=votes, y=rating)) + geom_point() +
stat_smooth(method=lm, level=.95) # add se=FALSE to disable CI
ggplot(movies, aes(x=votes, y=rating)) + geom_point() + stat_smooth() #The default is a polynomial fit
ggplot(movies1, aes(x=budget, y=rating, color=factor(Action))) +
geom_point() + labs(color='Action Movie?') + stat_smooth(method=lm, se=FALSE) #Fitting is smart to align with grouping
PART 2.åēæŦįĩąč¨éįŽ
2-1-1. Which Type of Film Cost the Most in Average?
movietype <- colnames(movies)[18:24]
movies1_singletype <- movies1[rowSums(movies1[, movietype]) == 1,] # remove multi-typed
movietype_alt <- c(movietype[length(movietype)], movietype[-length(movietype)])
# convert multiple dummies into one factor as grouping var
# a little matrix operation will do the trick
dummies <- as.matrix(movies1_singletype[, movietype_alt])
movies1_singletype$Type <- factor(dummies %*% (1:length(movietype_alt)), labels=movietype_alt)
# Compute the Average Budget of Each Type
tapply(movies1_singletype$budget, movies1_singletype$Type, mean)
## Short Action Animation Comedy Drama Documentary
## 396133.1 32698189.6 32311451.6 11921970.6 10456690.5 729704.8
## Romance
## 5603688.0
2-1-2. Determine the variation
#The first factor level of movietype, Short, is represented as the intercept term.
lmfit <- lm(as.formula("budget ~ Type"), movies1_singletype)
summary(lmfit)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 396133.1 1715935 0.2308556 8.174455e-01
## TypeAction 32302056.5 2062063 15.6649187 7.005190e-53
## TypeAnimation 31915318.5 4157723 7.6761525 2.316874e-14
## TypeComedy 11525837.5 1888686 6.1025705 1.202481e-09
## TypeDrama 10060557.4 1820528 5.5261750 3.604075e-08
## TypeDocumentary 333571.7 2881175 0.1157763 9.078389e-01
## TypeRomance 5207554.9 3713295 1.4024082 1.609149e-01
2-1-3. Another way to estimate the coefficients
#The last predictor, Short is combined into the intercept term.
# mean(movies1_singletype[movies1_singletype$Animation == 1, 'budget'])
lmfit <- lm(as.formula(paste('budget ~', paste(movietype, collapse=' + '))),
movies1_singletype)
summary(lmfit)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 396133.1 1715935 0.2308556 8.174455e-01
## Action 32302056.5 2062063 15.6649187 7.005190e-53
## Animation 31915318.5 4157723 7.6761525 2.316874e-14
## Comedy 11525837.5 1888686 6.1025705 1.202481e-09
## Drama 10060557.4 1820528 5.5261750 3.604075e-08
## Documentary 333571.7 2881175 0.1157763 9.078389e-01
## Romance 5207554.9 3713295 1.4024082 1.609149e-01
2-1-4. Draw the regression lines of each type
#What is the association between cost and rating, conditional on type?
movies1_reg_plot <- ggplot(movies1_singletype, aes(x=budget, y=rating, color=Type)) +
geom_point(shape=1) +
# set fullrange=T will extend the fitted line outside the sample range
stat_smooth(method=lm, se=FALSE, fullrange=FALSE, size=1.5) +
# color is the grouping interface, hence scale_color_*
scale_color_discrete(name='Movie Type: # of samples',
labels=paste(levels(movies1_singletype$Type), ': ',
table(movies1_singletype$Type)))
2-1-5.Output
movies1_reg_plot