本文所有的例子均来自Analytics Edge的课程。

地图的标记

# Install the two mapping packages only when they are missing, then attach
# them. Calling install.packages() unconditionally would download and
# reinstall both packages every time these lines are run.
if (!requireNamespace("maps", quietly = TRUE)) {
  install.packages("maps")
}
if (!requireNamespace("ggmap", quietly = TRUE)) {
  install.packages("ggmap")
}
library(maps)
library(ggmap)

# Fetch a map for the location "chicago" at zoom level 11 and keep it so the
# later plots can reuse it.
# NOTE(review): recent ggmap releases appear to need a registered API key
# before get_map() can download anything -- confirm for the installed version.
chicago <- get_map(location = "chicago", zoom = 11)

# Display the map on its own.
ggmap(chicago)

Image Title

假设我们有一个数据mvt,每一条记录记录了在某个(经度, 纬度)上发生的犯罪的事件,现在想在地图上标出来

# Overlay the first 100 motor vehicle thefts on the Chicago map, one point
# per record at its (Longitude, Latitude).
ggmap(chicago) +
  geom_point(
    data = mvt[1:100, ],
    mapping = aes(x = Longitude, y = Latitude)
  )

Image Title

每个地点发生犯罪的频率不同,我们想凸显这个信息

# Round longitude and latitude to 2 digits and count the thefts that fall
# into each rounded (longitude, latitude) cell.
LatLonCounts <- as.data.frame(
  table(round(mvt$Longitude, 2), round(mvt$Latitude, 2))
)

# The rounded coordinates come back as the columns Var1 and Var2; convert
# them to numbers via their character labels.
LatLonCounts$Long <- as.numeric(as.character(LatLonCounts$Var1))
LatLonCounts$Lat <- as.numeric(as.character(LatLonCounts$Var2))

# table() yields every longitude/latitude combination, including cells in
# which no theft was recorded. Drop those zero-count rows so that they are
# not drawn on the map as if something had happened there.
LatLonCounts <- LatLonCounts[LatLonCounts$Freq > 0, ]

# Plot one point per cell; both colour and size encode the theft count.
ggmap(chicago) +
  geom_point(
    data = LatLonCounts,
    mapping = aes(x = Long, y = Lat, color = Freq, size = Freq)
  )

Image Title

# Same point map, but with a yellow-to-red colour gradient for the counts.
ggmap(chicago) +
  geom_point(
    data = LatLonCounts,
    mapping = aes(x = Long, y = Lat, colour = Freq, size = Freq)
  ) +
  scale_colour_gradient(low = "yellow", high = "red")

Image Title

# Alternative geometry: red tiles whose transparency encodes the count.
ggmap(chicago) +
  geom_tile(
    data = LatLonCounts,
    mapping = aes(x = Long, y = Lat, alpha = Freq),
    fill = "red"
  )

Image Title

下面考虑全美国的犯罪情况

这个是美国地图:

# Load the US state map data.
statesMap <- map_data("state")

# Draw every state as a white polygon with a black outline.
ggplot(statesMap, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", colour = "black")

Image Title

下面将地图数据与犯罪数据按州名merge成一个数据(下文的murderMap),然后画出每州的谋杀案数量

# Fill each state by its number of murders, on a black-to-red gradient
# with a legend as the guide.
ggplot(
  murderMap,
  aes(x = long, y = lat, group = group, fill = Murders)
) +
  geom_polygon(colour = "black") +
  scale_fill_gradient(low = "black", high = "red", guide = "legend")

Image Title

下图是美国各州人口图:

# Fill each state by its population, using the same black-to-red gradient.
ggplot(
  murderMap,
  aes(x = long, y = lat, group = group, fill = Population)
) +
  geom_polygon(colour = "black") +
  scale_fill_gradient(low = "black", high = "red", guide = "legend")

Image Title

计算犯罪率:

# Add the murder rate: murders per 100,000 population.
murderMap$MurderRate <- murderMap$Murders / murderMap$Population * 100000

# Same map as before, now filled by the murder rate.
ggplot(
  murderMap,
  aes(x = long, y = lat, group = group, fill = MurderRate)
) +
  geom_polygon(colour = "black") +
  scale_fill_gradient(low = "black", high = "red", guide = "legend")

Image Title

# Restrict the fill scale to the range 0-10 so that states with a murder
# rate above 10 are taken out of the gradient.
ggplot(
  murderMap,
  aes(x = long, y = lat, group = group, fill = MurderRate)
) +
  geom_polygon(colour = "black") +
  scale_fill_gradient(
    low = "black",
    high = "red",
    guide = "legend",
    limits = c(0, 10)
  )

Image Title

在地球仪上的应用

这里省略了很多,但这里要强调的是数据最终投影到地球仪上去了(coord_map(...))

# Other projections work too; this one ("ortho") is visually interesting.
ggplot(world_map, aes(x = long, y = lat, group = group)) +
  geom_polygon(mapping = aes(fill = Total), colour = "black") +
  coord_map(projection = "ortho", orientation = c(20, 30, 0))

Image Title

折线图

假设我们有一个dataframe叫WeekdayCounts,两个属性为Var1和Freq,其中Var1表示星期几(weekday),Freq表示数量。

# Turn "Var1" into an ORDERED factor so the days keep their calendar order.
# The level labels are the Chinese weekday names (Sunday through Saturday)
# because that is what the system defaults produced for this data.
WeekdayCounts$Var1 <- factor(
  WeekdayCounts$Var1,
  levels = c("星期日", "星期一", "星期二", "星期三", "星期四", "星期五", "星期六"),
  ordered = TRUE
)

# Simple line chart of the counts per weekday; group = 1 joins all days
# into a single line.
ggplot(WeekdayCounts, aes(x = Var1, y = Freq)) +
  geom_line(aes(group = 1)) +
  xlab("Day of the Week") +
  ylab("Total Motor Vehicle Thefts")

我们得到:

a simple plot

假设我们有一个dataframe叫做DayHourCounts,是根据weekday和hour两个属性进行table操作得到的freq。也就是说该dataframe有三个属性:Var1(weekday)、Hour以及Freq。

# One line per weekday across the hours, each weekday in its own colour.
ggplot(DayHourCounts, aes(x = Hour, y = Freq)) +
  geom_line(mapping = aes(group = Var1, colour = Var1), size = 2)

Image Title

# Label every row as "Weekend" or "Weekday" based on the weekday name in
# Var1. Both the English and the Chinese day names are matched: elsewhere
# in this document Var1 is given Chinese weekday levels, and comparing only
# against "Sunday"/"Saturday" would then classify every row as "Weekday".
DayHourCounts$Type <- ifelse(
  DayHourCounts$Var1 %in% c("Saturday", "Sunday", "星期六", "星期日"),
  "Weekend",
  "Weekday"
)

# Same line chart, but coloured by Type instead of by individual weekday.
ggplot(DayHourCounts, aes(x = Hour, y = Freq)) +
  geom_line(mapping = aes(group = Var1, colour = Type), size = 2)

Image Title

# Draw the lines half transparent (alpha = 0.5).
ggplot(DayHourCounts, aes(x = Hour, y = Freq)) +
  geom_line(
    mapping = aes(group = Var1, colour = Type),
    size = 2,
    alpha = 0.5
  )

Image Title

创建heat map:

# Fix the order of the days for the heat map: Monday first, Sunday last
# (Chinese weekday names as level labels).
DayHourCounts$Var1 <- factor(
  DayHourCounts$Var1,
  levels = c("星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"),
  ordered = TRUE
)

# Heat map: hour on the x-axis, weekday on the y-axis, fill by count.
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) +
  geom_tile(mapping = aes(fill = Freq))

Image Title

# Rename the legend to "Total MV Thefts" and remove the y-axis title.
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) +
  geom_tile(mapping = aes(fill = Freq)) +
  scale_fill_gradient(name = "Total MV Thefts") +
  theme(axis.title.y = element_blank())

Image Title

# Same heat map with a white-to-red fill gradient.
ggplot(DayHourCounts, aes(x = Hour, y = Var1)) +
  geom_tile(mapping = aes(fill = Freq)) +
  scale_fill_gradient(
    name = "Total MV Thefts",
    low = "white",
    high = "red"
  ) +
  theme(axis.title.y = element_blank())

Image Title

Bar charts

假设我们的数据(dataframe)intl有两个属性:Region与PercentOfIntl,每一行形式如:(Asia, 0.531).下面要画条形图:

# Bar chart with Region on the x-axis and PercentOfIntl on the y-axis.
# stat = "identity" uses the values as bar heights; each bar is also
# labelled with its value.
ggplot(intl, aes(x = Region, y = PercentOfIntl)) +
  geom_bar(stat = "identity") +
  geom_text(mapping = aes(label = PercentOfIntl))

Image Title

还可以对上图进行一些微调:

# Reorder the Region factor by descending PercentOfIntl so that the bars
# appear from largest to smallest.
intl <- transform(intl, Region = reorder(Region, -PercentOfIntl))

# Express the values as percentages out of 100 instead of fractions.
intl$PercentOfIntl <- 100 * intl$PercentOfIntl

# Final plot: dark blue bars, value labels nudged above the bars, a
# descriptive y-axis title, no x-axis title, and x tick labels rotated 45
# degrees.
ggplot(intl, aes(x = Region, y = PercentOfIntl)) +
  geom_bar(stat = "identity", fill = "dark blue") +
  geom_text(mapping = aes(label = PercentOfIntl), vjust = -0.4) +
  ylab("Percent of International Students") +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Image Title

最后一个例子

假设我们有一个dataframe叫做households,结构如下(就是part1里面的那个stacked chart的例子):

> str(households)
'data.frame':   8 obs. of  7 variables:
 $ Year          : int  1970 1980 1990 1995 2000 2005 2010 2012
 $ MarriedWChild : num  40.3 30.9 26.3 25.5 24.1 22.9 20.9 19.6
 $ MarriedWOChild: num  30.3 29.9 29.8 28.9 28.7 28.3 28.8 29.1
 $ OtherFamily   : num  10.6 12.9 14.8 15.6 16 16.7 17.4 17.8
 $ MenAlone      : num  5.6 8.6 9.7 10.2 10.7 11.3 11.9 12.3
 $ WomenAlone    : num  11.5 14 14.9 14.7 14.8 15.3 14.8 15.2
 $ OtherNonfamily: num  1.7 3.6 4.6 5 5.7 5.6 6.2 6.1

现在要改画成折线图:

# Make sure ggplot2 is attached.
library(ggplot2)

# reshape2 provides melt().
library(reshape2)

# Melt households with Year as the id column, then draw one coloured line
# (with point markers) per melted variable over the years.
ggplot(
  melt(households, id.vars = "Year"),
  aes(x = Year, y = value, colour = variable)
) +
  geom_line(size = 2) +
  geom_point(size = 5) +
  ylab("Percentage of Households")

Image Title

番外

R里面还有好多绘图的工具,都十分有趣。比如:

  1. 使用igraph库可以画出网络结构
    Image Title

  2. 使用wordcloud库可以画出:
    Image Title

  3. 还可以通过画多个直方图展现三维数据:

    # Overlaid, half-transparent histograms of time.served, one per crime,
    # with fill colours taken from colorPalette.
    ggplot(data = parole, mapping = aes(x = time.served, fill = crime)) +
      geom_histogram(binwidth = 1, position = "identity", alpha = 0.5) +
      scale_fill_manual(values = colorPalette)
    

    Image Title

总之,在R里面画图非常方便,画出来的效果也非常赞!当然,要学会R的绘图的话,还需要多加实践,记得看相关文档。