# Язык R Отчет (Скворцов) / Анализ данных на языке R.pdf
# step 6 group_by
# Convert `diamonds` to a tibble and build a copy grouped by the `cut` column.
# `as_tibble()` is the supported replacement for the deprecated
# `as_data_frame()`.
diamonds <- as_tibble(diamonds)
gr_diamonds <- group_by(diamonds, cut)

# Ungrouped table: two random rows, then the first row of the whole table.
sample_n(diamonds, 2)
slice(diamonds, 1)

# Grouped table: the same verbs are applied within each `cut` group.
sample_n(gr_diamonds, 2)
slice(gr_diamonds, 1)
# step 7 group_by and summarise
# Open the help page for summarise(); this call had been fused into the
# comment line above and was never executed.
?summarise()

# Summarise `gr_diamonds`: row count plus price / x / y statistics per group.
summarise(
  gr_diamonds,
  numbers = n(),
  mean_price = mean(price),
  mean_x = mean(x),
  median_y = median(y),
  min_y = min(y)
)
# step 8
# Regroup by two columns, so every summary below is per (cut, color) pair.
gr_diamonds <- group_by(diamonds, cut, color)

summarise(
  gr_diamonds,
  numbers = n(),
  mean_price = mean(price),
  mean_x = mean(x),
  median_y = median(y),
  min_y = min(y)
)

# Same summary plus `great_price`: summing the logical vector counts the rows
# in each group whose price exceeds 5000.
summarise(
  gr_diamonds,
  numbers = n(),
  mean_price = mean(price),
  mean_x = mean(x),
  median_y = median(y),
  min_y = min(y),
  great_price = sum(price > 5000)
)
# step 9
# Group mtcars by `am` and `vs`, then take the mean of every remaining column
# within each group. The function is passed directly: the `funs()` wrapper is
# deprecated in dplyr, and with a single function the output column names are
# the same either way.
gr_mtcars <- group_by(mtcars, am, vs)
my_means <- summarise_all(gr_mtcars, mean)
# 15. data.table
# step 1 fread function
library(data.table)

# Time the same file being read by data.table's fread() and by base
# read.table() (which needs the header flag and the ";" separator spelled out).
system.time(fread("products.csv"))
system.time(read.table("products.csv", header = TRUE, sep = ";"))

# Keep the fread() result; the following steps work on `products`.
products <- fread("products.csv")
# step 2 data.table vs dataframe
# First ten rows; this call had been fused into the comment line above.
products[1:10, ]

# data.frame-style filter: the column must be qualified with `products$`.
products[products$price > 10000, ]

# Base-R analogue: with() lets `Species` stand for iris$Species.
with(iris, iris[Species == "virginica", ])

# data.table-style filter: column names are resolved inside `[`.
products[price > 10000]
products[(price > 1000) & (brand %in% c("Epson", "Apple"))]
# step 3 data filtering
# Filter on the `available` column (presumably logical; confirm against
# products.csv), first directly and then with an explicit comparison.
products[available, ]
products[available == TRUE, ]

# Single-index subsetting: with a comma both objects return row 3; without it
# a data.table still returns row 3, while a data.frame returns column 3.
products[3, ]
iris[3, ]
products[3]
iris[3]

# Negated filters: every brand except the two listed, and every row except
# the first ten.
products[!(brand %in% c("Apple", "Epson"))]
products[!(1:10)]
# step 4 data transformation
# `j` as a list: keep `name` and add the price divided by 1000.
products[, list(name, price.1k = price / 1000)]

# Row permutation that sorts by descending price, then the same ordering
# applied inside `[`, alone and combined with the computed column.
order(products$price, decreasing = TRUE)
products[order(price, decreasing = TRUE)]
products[order(price, decreasing = TRUE), list(name, price.1k = price / 1000)]

# Same, with the value formatted as text. The suffix is " тыс.руб" with a
# leading space; the literal had been split by a stray line break.
products[
  order(price, decreasing = TRUE),
  list(name, price.1k = paste0(price / 1000, " тыс.руб"))
]

# Only the first five rows of the formatted, sorted result.
head(
  products[
    order(price, decreasing = TRUE),
    list(name, price.1k = paste0(price / 1000, " тыс.руб"))
  ],
  5
)
# step 5 data transformation advanced
# Extract the computed column as a plain vector with `$`.
products[
  order(price, decreasing = TRUE),
  list(price.1k = paste0(price / 1000, " тыс.руб"))
]$price.1k

# Three ways to select columns: list(), its `.()` alias, and a character
# vector with `with = FALSE`.
products[, list(name, price)]
products[, .(name, price)]
products[, c("name", "price"), with = FALSE]

# `j` may hold arbitrary expressions: head() of each column after sorting by
# descending price, and a single aggregated value.
products[order(-price), .(name = head(name), price = head(price))]
products[, .(price = sum(price))]

# Build a combined label, then sort by it: once through an intermediate
# variable and once by chaining a second `[`.
a <- products[, list(name.with.brand = paste0(brand, " - ", name))]
a[order(name.with.brand)]
products[, list(name.with.brand = paste0(brand, " - ", name))][
  order(name.with.brand)
]

# A braced block in `j`: the value of its last expression becomes the column
# (minimum price, maximum price, and the mean-to-median ratio).
products[, .(price = {
  a <- mean(price)
  b <- median(price)
  c(min(price), max(price), a / b)
})]
# Attach data.table and re-read the products file, this time asking fread()
# to read the `price` column as double.
library("data.table")
products <- fread(
  "products.csv",
  colClasses = c("price" = "double")
)