# Homework0 - Example 2

# Must do: You need to set the working directory to the source file location.
# You may do this by going to menu
# Tools -> Set Working Directory -> To Source File Location

# Utility function for importing data from a csv (comma-separated values,
# flat table) file.
#   filename: path of the csv file to read; its first line is used as header
#   returns:  a data frame holding the contents of the file
import.csv <- function(filename) {
  read.csv(filename, sep = ",", header = TRUE)
}

# Utility function for exporting data to a csv file.
# Note that csv files are very portable, practically all tools understand
# them, including Microsoft Excel.
#   ob:       the object (typically a data frame) to write
#   filename: path of the csv file to create
# The file is written without quotes around values and without row names.
# NOTE: this definition masks utils::write.csv for the rest of the session
# and accepts only the two arguments (ob, filename).
write.csv <- function(ob, filename) {
  write.table(ob, filename, quote = FALSE, sep = ",", row.names = FALSE)
}

# Let us load some data set: mpg.csv
# This data collects information about various car models and their fuel
# efficiency.
# It is publicly available at University of California at Irvine benchmark
# data repository, a popular resource for data miners who want to showcase
# their algorithms on data that is well understood and appropriate for public
# showing.
# To find more about this data set go to
# http://archive.ics.uci.edu/ml/datasets/Auto+MPG
my.data <- import.csv("mpg.csv")

# To examine the contents of the just loaded data set
# feel free to type any/all of the following commands in the R Studio console
# window (of course skip the hash marks which R considers as comment tags)
# head(my.data)     # print the first 6 rows of data (the default)
# head(my.data,20)  # print top 20 rows of data
# tail(my.data)     # print the last few rows of data
# summary(my.data)
# str(my.data)      # which variable is of type 'factor'?

# Below we will perform various modifications to the data, feel free
# to inspect the result using the functions shown above,
# e.g. by typing head(x) in your console.

##############################
# Q1: Slicing data frames
##############################
x <- my.data[1:10, ]          # first 10 rows of the data frame
x <- my.data[, c(1, 3, 5)]    # select columns number 1,3,5
x <- my.data[, -1]            # select all but first column
x <- my.data$mpg              # select column named mpg
x <- my.data[, "mpg"]         # select column named mpg

##############################
# Q1 Exercise:
# goal: create a data frame with all but last column of my.data,
#       print dimensionality of the resulting data frame
# hint: use ncol to get the index of the last column,
#       dim() to get dimension information
##############################
cat("Q1\n")
nf <- ncol(my.data)           # index of the last column
x <- my.data[, -nf]           # negative index drops that column
print(dim(x))

##############################
# Q2: Subsetting data frames
##############################
x <- my.data$mpg              # extract attribute named 'mpg'
# extract records for which mpg is above median
high.mpg <- subset(my.data, mpg > median(x))
# as above, but include only three selected attributes in the resulting
# data frame
high.mpg2 <- subset(
  my.data,
  mpg > median(x),
  select = c(cylinders, displacement, horsepower)
)
# as above, retain all features but 'cylinders'
high.mpg3 <- subset(my.data, mpg > median(x), select = -cylinders)

##############################
# Q2 Exercise:
# goal: create a data frame called x with records of non-US cars with all but
#       'maker' attribute and report dimensionality of the resulting data
#       frame
# hint: since 'maker' dimension may be imported as a factor data type, we
#       convert it to a string type for comparison; we may use the
#       as.character() function to accomplish that (it is a harmless no-op
#       when the column is already of character type)
##############################
cat("Q2\n")
x <- subset(my.data, as.character(maker) != "usa", select = -maker)
print(dim(x))

##############################
# Q3: function 'which'
##############################
# extract the index of records for which maker == 'usa'
# function 'with' saves us from having to type 'my.data$maker'
x.index <- with(my.data, which(maker == "usa"))
x.count <- length(x.index)    # count how many elements are in the index
x <- length(which(my.data$maker == "usa"))  # put together
# ifelse is another useful function
mpg_cat <- with(my.data, ifelse(mpg > mean(mpg), "high", "low"))

##############################
# Q3 Exercise:
# for variable 'mpg_cat' output the percentage of records with high mpg
# (the value printed below is expressed as a fraction between 0 and 1)
##############################
cat("Q3\n")
x <- length(which(mpg_cat == "high")) / length(mpg_cat)
print(x)

##############################
# Q4: function 'table'
##############################
# frequency count of makers, note that 'maker' is a categorical variable
x <- with(my.data, table(maker))
# frequency count of modelyear, which is an integer
x <- with(my.data, table(modelyear))
# frequency count of acceleration, which is a float (numeric variable),
# but does it make practical sense to do this?
x <- with(my.data, table(acceleration))
# cross-tabulation of modelyear and maker
x <- with(my.data, table(modelyear, maker))

##############################
# Q4 Exercise:
# goals: (a) create a table showing the count of records by maker and by
#            mpg_cat
#        (b) from the table, output the number of cars made in usa and in low
#            mpg category
# hint: first create a data frame: data.frame(mpg_cat,maker), error?
#       remember '$'
# hint: table object is like a data frame which can be referenced by
#       row/column names, cf. Q1 above
##############################
cat("Q4\n")
df <- data.frame(mpg_cat, maker = my.data$maker)
tbl <- table(df)              # rows: mpg_cat, columns: maker
print(tbl["low", "usa"])

##############################
# Q5: sorting
##############################
x <- my.data[order(my.data$mpg), ]   # sort the data frame by mpg, low to high
x <- my.data[order(-my.data$mpg), ]  # sort the data frame by mpg, high to low
# sort the data frame by mpg then by horsepower, in decreasing order
x <- with(my.data, my.data[order(-mpg, -horsepower), ])

##############################
# Q5: Exercise
# goal: tabulate modelyear attribute, sort the resulting table,
#       output the number of cars from the most populous model year
# hints: we will need to convert table into a data frame:
#        data.frame(our_table)
#        then we need to take a look at the table to decide which attribute
#        to sort on
##############################
cat("Q5\n")
# data.frame() of a one-way table yields columns 'modelyear' and 'Freq'
tbl <- data.frame(table(modelyear = my.data$modelyear))
sorted.tbl <- tbl[order(-tbl$Freq), ]   # most frequent model year first
print(sorted.tbl[1, 2])                 # column 2 is the count (Freq)

##############################
# Q6: summarization of data using plyr package and declaring functions
##############################
library(plyr)  # we need to install package plyr if we have not done so yet

# here we demonstrate a group aggregation function implemented in plyr package
# it is very similar to the SQL "group by" function
# in this example, we compute the mean and standard deviation of mpg for each
# unique combination of maker and cylinders
x <- ddply(
  my.data,
  c("maker", "cylinders"),
  function(df) c(mean = mean(df$mpg), sd = sd(df$mpg))
)

# it often makes sense to declare a sequence of frequently executed commands
# as a function
# it saves typing time, and neatly organizes structure of our scripts
# especially if we end up standardizing non-trivial functionality
# in this example, we define a function 'get_cor' that computes linear
# correlation between mpg and weight
# and then we use it to compute correlations for each combination of maker
# and cylinders
#   df:      a data frame with numeric columns 'mpg' and 'weight'
#   returns: a named numeric vector with a single element 'cor'
get_cor <- function(df) {
  c(cor = cor(df$mpg, df$weight))
}
x <- ddply(my.data, c("maker", "cylinders"), get_cor)

##############################
# Q6 Exercise:
# goal: for each combination of maker and modelyear,
#       compute the ratio of mean(displacement) to mean(horsepower)
#       output the max ratio
##############################
cat("Q6\n")
#   df:      a data frame with numeric columns 'displacement' and 'horsepower'
#   returns: a named numeric vector with a single element 'ratio'
get_ratio <- function(df) {
  c(ratio = mean(df$displacement) / mean(df$horsepower))
}
x <- ddply(my.data, c("maker", "modelyear"), get_ratio)
print(max(x$ratio))