Jared P. Lander
June 13, 2014
# Mean price per cut, computed with data.table.
# NOTE(review): assumes `diamonds` is already available in the session
# (it is loaded via ggplot2 in a later snippet) -- confirm.
require(data.table)

# Work on a data.table copy of the data.
diaDT <- as.data.table(diamonds)

# The j expression is unnamed, so the result column gets the default name V1.
diaDT[, mean(price), by = cut]
cut V1 1: Ideal 3458 2: Premium 4584 3: Good 3929 4: Very Good 3982 5: Fair 4359
# Mean price per cut, computed with dplyr on the in-memory data frame.
require(dplyr)
require(ggplot2)

# Load the diamonds dataset into the workspace.
data(diamonds)

# summarize() is namespace-qualified so dplyr's version is the one called.
diamonds %>%
  group_by(cut) %>%
  dplyr::summarize(price = mean(price))
Source: local data frame [5 x 2] cut price 1 Fair 4359 2 Good 3929 3 Very Good 3982 4 Premium 4584 5 Ideal 3458
// Compute the mean of one column within each group of another column.
//
// DF  : data frame containing both columns.
// var : name of the column to average; read as a NumericVector.
// id  : name of the grouping column; read as a CharacterVector.
//
// Returns a NumericVector holding one mean per distinct group value, named
// by the group values in the order produced by unique().
NumericVector agger(DataFrame DF, std::string var, std::string id){
    NumericVector values = DF[var];
    CharacterVector groups = DF[id];
    CharacterVector levels = unique(groups);
    NumericVector means(levels.size());

    // Collect the values belonging to each group.
    // NOTE(review): the map is keyed on SEXP, i.e. on the handle of each
    // group string; this presumably relies on equal strings sharing one
    // handle -- confirm.
    const int nRows = groups.size();
    std::map<SEXP, std::vector<double> > buckets;
    for (int row = 0; row < nRows; ++row){
        buckets[groups[row]].push_back(values[row]);
    }

    // Average each bucket: sum starting from 0.0, divided by element count.
    for (int g = 0; g < levels.size(); ++g){
        std::vector<double>& bucket = buckets[levels[g]];
        means[g] = accumulate(bucket.begin(), bucket.end(), 0.0) / bucket.size();
    }

    means.names() = levels;
    return means;
}
Ideal Very Good Fair Good Premium 3458 3982 4359 3929 4584
# Arithmetic on a file-backed matrix.
require(bigmemory)
require(bigalgebra)

# 5 x 5 matrix of doubles, stored in the backing file "A".
A <- big.matrix(5, 5, type = "double", backingfile = "A")

# Fill every cell from the sequence 1..25.
A[, ] <- 1:25

# Add the matrix to itself and pull the full result into memory for printing.
# NOTE(review): presumably bigalgebra supplies `+` for big.matrix -- confirm.
(A + A)[, ]
[,1] [,2] [,3] [,4] [,5] [1,] 2 12 22 32 42 [2,] 4 14 24 34 44 [3,] 6 16 26 36 46 [4,] 8 18 28 38 48 [5,] 10 20 30 40 50
# Regression on an ffdf copy of the data, fitted with bigglm.
# NOTE(review): assumes `diamonds` is already available in the session -- confirm.
require(ffbase)
require(biglm)

# Convert the data frame to an ffdf.
diaFF <- as.ffdf(diamonds)

# Model price on carat and cut with chunksize set to 10000.
modff1 <- bigglm(
  price ~ carat + cut,
  data = diaFF,
  chunksize = 10000
)

summary(modff1)
Large data regression model: bigglm(price ~ carat + cut, data = diaFF, chunksize = 10000) Sample size = 53940 Coef (95% CI) SE p (Intercept) -2701.4 -2732.2 -2670.5 15.4 0 carat 7871.1 7843.1 7899.0 14.0 0 cut.L 1239.8 1187.6 1292.0 26.1 0 cut.Q -528.6 -574.9 -482.3 23.1 0 cut.C 367.9 327.5 408.3 20.2 0 cut^4 74.6 42.1 107.1 16.2 0
# Point dplyr at the SQLite file diamonds.db inside dataDir.
# NOTE(review): `dataDir` is defined outside this snippet -- confirm it is set.
diaDBSource <- src_sqlite(file.path(dataDir, "diamonds.db"))

# Reference the "diamonds" table in that source.
diaDB <- tbl(diaDBSource, "diamonds")

# Print the table reference.
diaDB
Source: sqlite 3.7.17 [../../data/diamonds.db] From: diamonds [53,940 x 10] carat cut color clarity depth table price x y z 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39 .. ... ... ... ... ... ... ... ... ... ...
# Mean price per cut, computed against the database-backed table.
# The result column is named Price (capitalised).
diaDB %>%
  group_by(cut) %>%
  dplyr::summarize(Price = mean(price))
Source: sqlite 3.7.17 [../../data/diamonds.db] From: <derived table> [?? x 2] cut Price 1 Fair 4359 2 Good 3929 3 Ideal 3458 4 Premium 4584 5 Very Good 3982 .. ... ...
# Mean price per cut, computed through the scidb package.
require(scidb)

# Connect to the SciDB instance on localhost.
scidbconnect(host = "localhost")

# Reference the array named "diamonds".
diaSci <- scidb("diamonds")

# Aggregate price by cut; the aggregation expression is passed as a string.
aggregate(diaSci, price ~ cut, "avg(price) as Price")
cut Price 1 Fair 4359 2 Good 3929 3 Very Good 3982 4 Premium 4584 5 Ideal 3458
# Weekly mean of T.Close from the att series, queried through influxdb.
require(influxdb)

# NOTE(review): the positional arguments look like host, port, user, password,
# database and query text -- confirm against the package signature.
# NOTE(review): the password is a literal in the call ('xxxxxxxx' looks like a
# redacted placeholder) -- confirm no real credential is committed here.
influxdb_query(
  'localhost',
  8086,
  'jaredlander',
  'xxxxxxxx',
  'stocks',
  'SELECT mean(T.Close) FROM att group by time(1w)'
)
time mean 1 1 32.27 2 2 31.42 3 3 31.13 4 4 30.57 5 5 31.12 6 6 31.14
# Mean price per cut, expressed as a plyrmr pipeline.
require(plyrmr)

# Read from the "/diamonds" path, group by cut, then compute the mean price
# per group; steps are chained with plyrmr's %|% operator.
input("/diamonds") %|%
  group(cut) %|%
  transmute(Price = mean(price))
cut Price 1 Fair 4359 1.1 Good 3929 1.2 Very Good 3982 1.3 Premium 4584 1.4 Ideal 3458
# Linear model on an XDF data source, fitted with RevoScaleR.
require(RevoScaleR)

# Data source backed by diamonds.xdf inside dataDir.
# NOTE(review): `dataDir` is defined outside this snippet -- confirm it is set.
diaXDF <- RxXdfData(file.path(dataDir, "diamonds.xdf"))

# Model price on carat and cut; reportProgress is set to 0.
mod1 <- rxLinMod(
  price ~ carat + cut,
  data = diaXDF,
  reportProgress = 0
)

# Print the fitted model with header = FALSE.
print(mod1, header = FALSE)
Coefficients: price (Intercept) -2074.5 carat 7871.1 cut=Fair -1800.9 cut=Good -680.6 cut=Very Good -290.8 cut=Premium -361.8 cut=Ideal Dropped