# Exploratory graphics walkthrough using the `diamonds` data and ggplot2.
#
# The script is meant to be stepped through interactively: each statement
# either prints a summary of the data or draws one plot.
#
# NOTE(review): qplot(), its `facets` argument and ggfluctuation() belong to
# an older ggplot2 API; confirm the installed ggplot2 still provides them.
library(ggplot2)

head(diamonds)
str(diamonds)
?diamonds
summary(diamonds)

# Examining distributions
# ======================================================

# Histograms ---------------------------------------
qplot(price, data = diamonds, geom = "histogram", fill = I("black"))

# ALWAYS EXPERIMENT WITH THE BIN SIZE!
qplot(price, data = diamonds, geom = "histogram", binwidth = 500,
      fill = I("black"))

# ALWAYS EXPERIMENT WITH THE BIN SIZE!
qplot(price, data = diamonds, geom = "histogram", binwidth = 100,
      fill = I("black"))

# ALWAYS EXPERIMENT WITH THE BIN SIZE!
qplot(price, data = diamonds, geom = "histogram", binwidth = 50,
      fill = I("black"))

# Zoom in
qplot(price, data = diamonds, geom = "histogram", binwidth = 50,
      fill = I("black"), xlim = c(0, 2500))

# Your turn 1
qplot(carat, data = diamonds, geom = "histogram", fill = I("black"))
qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.2,
      fill = I("black"))
qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.1,
      fill = I("black"))
qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.05,
      fill = I("black"))
qplot(carat, data = diamonds, geom = "histogram", binwidth = 0.01,
      fill = I("black"))

# Investigating relationships
# ======================================================

# Two continuous variables -----------------------------------
# Use a scatterplot
qplot(carat, price, data = diamonds)
qplot(log(carat), log(price), data = diamonds)
qplot(carat, price / carat, data = diamonds)

# Map extra variables to other aesthetic attributes
qplot(carat, price, data = diamonds, colour = clarity)

# Categorical and continuous
qplot(color, price / carat, data = diamonds)
qplot(color, price / carat, data = diamonds, geom = "jitter")
qplot(color, price / carat, data = diamonds, geom = "boxplot")
qplot(color, price / carat, data = diamonds, geom = c("jitter", "boxplot"))
qplot(color, price / carat, data = diamonds, geom = "jitter",
      colour = I(alpha("grey60", 0.5))) +
  geom_boxplot(colour = I("black"), fill = I("red"))

# Facetting displays the same plot for different subsets of the data.
# Use the facets argument: rows go on the left-hand side of the formula.
qplot(carat, price, data = diamonds, facets = . ~ color)
qplot(carat, price, data = diamonds, facets = . ~ color,
      colour = I(alpha("black", 0.1)))
qplot(carat, price, data = diamonds, facets = . ~ color,
      colour = I(alpha("black", 0.1))) +
  geom_smooth(colour = I("red"), method = "lm")
qplot(log(carat), log(price), data = diamonds, facets = . ~ clarity,
      colour = I(alpha("black", 0.1))) +
  geom_smooth(colour = I("red"), method = "lm")

# Or we can use a histogram for each colour to look at the shape of
# the distribution in more detail:
qplot(price, data = diamonds, facets = color ~ ., geom = "histogram")

# ALWAYS EXPERIMENT WITH THE BIN SIZE!
qplot(price / carat, data = diamonds, facets = color ~ .,
      geom = "histogram", binwidth = 100)

# Two categorical variables -----------------------------------
# Use a fluctuation diagram - a visualisation of the contingency table
qplot(color, cut, data = diamonds, geom = "jitter",
      colour = I(alpha("black", 0.1)))
ggfluctuation(table(diamonds$cut, diamonds$color))
ggfluctuation(table(diamonds$cut, diamonds$color), type = "colour")

# Set an explicit level order for cut and clarity before tabulating.
# This assigns into `diamonds`, so later plots use the re-levelled copy.
diamonds$cut <- factor(
  diamonds$cut,
  c("Fair", "Good", "Very Good", "Premium", "Ideal")
)
diamonds$clarity <- factor(
  diamonds$clarity,
  c("IF", "VVS1", "VVS2", "VS1", "VS2", "SI1", "SI2", "I1")
)

# Contingency table of cut (rows) by color (columns), with named dimensions
cut_color <- table(diamonds$cut, diamonds$color)
names(dimnames(cut_color)) <- c("cut", "color")
ggfluctuation(cut_color)

# Column and row proportions.
# prop.table() divides along the requested margin. Writing
# `cut_color / colSums(cut_color)` instead recycles the column totals down
# each column (R stores tables column-major), so cells get divided by the
# wrong total without any warning.
ggfluctuation(prop.table(cut_color, margin = 2))
ggfluctuation(prop.table(cut_color, margin = 1))

# Focus only on best/worst clarity, best/worst color
diamonds.sub <- subset(
  diamonds,
  clarity %in% c("I1", "IF") & color %in% c("J", "D")
)
dim(diamonds.sub)

qplot(log(carat), log(price), data = diamonds.sub,
      facets = color ~ clarity, colour = I("black")) +
  geom_smooth(colour = I("red"), method = "lm")
qplot(log(carat), log(price), data = diamonds.sub,
      facets = cut ~ clarity + color, colour = I("black")) +
  geom_smooth(colour = I("red"), method = "lm")

# Draw multiple lines on the same scatterplot: one linear fit per level of
# the variable mapped to colour, layered over the same base scatterplot.
p <- qplot(log(carat), log(price), data = diamonds, geom = "point")
p + geom_smooth(aes(x = log(carat), y = log(price), colour = color),
                method = "lm")
p + geom_smooth(aes(x = log(carat), y = log(price), colour = clarity),
                method = "lm")