R Software Used in Experiments See article: "Big Data Scaling through Metric Mapping: Exploiting the Remarkable Simplicity of Very High Dimensional Spaces using Correspondence Analysis" Presented at IFCS 2015, International Federation of Classification Societies Conference, Bologna, Italy, July 2015. Fionn Murtagh, fmur tagh at acm dot org --------------------------------------------------------------------- Evaluation 1: Dense, Uniformly Distributed Data set.seed(4781) x <- runif(86000000) x100 <- buildmat(x, 86, 100) # See below for the function, buildmat x1000 <- buildmat(x, 86, 1000) x10000 <- buildmat(x, 86, 10000) x100000 <- buildmat(x, 86, 100000) x1000000 <- buildmat(x, 86, 1000000) xtab <- t(z100) # Transpose used for efficiency (eigen-reduction). # PROCESSING (SEE BELOW) xtab <- t(z1000) # PROCESSING xtab <- t(z10000) # PROCESSING xtab <- t(z100000) # PROCESSING xtab <- t(z1000000) # PROCESSING buildmat <- function(sig, numsegs, lenseg) { # Input signal, number of segments, segment length. # From a signal, take successive segments of specified length, # assemble row-wise in a matrix. outmat <- sig[1:lenseg] # Very first segment. finseg <- numsegs - 1 # For final segment. for (i in 1: finseg) { # For 2nd, 3rd, ... final segment.. loind <- (i * lenseg) + 1 hiind <- (i * lenseg) + lenseg nextseg <- sig[loind : hiind] outmat <- rbind(outmat, nextseg) # Build up by row. } outmat } # PROCESSING # THE PROCESSING: CORRESPONDENCE ANALYSIS (EIGEN-REDUCTION, USING eigen) tot <- sum(xtab); fIJ <- xtab/tot; fI <- apply(fIJ, 1, sum); fJ <- apply(fIJ, 2, sum) if (length(fI[fI <= 0]) >= 1) cat("Note and check: fI terms le 0. <-- 1.\n") if (length(fJ[fJ <= 0]) >= 1) cat("Note and check: fJ terms le 0. <-- 1.\n") fI[fI <= 0] <- 1; fJ[fJ <= 0] <- 1; fJsupI <- sweep(fIJ, 1, fI, FUN=''/'') fIsupJ <- sweep(fIJ, 2, fJ, FUN="/"); s <- as.matrix(t(fJsupI)) s1 <- sweep(s, 1, sqrt(fJ), FUN="/"); s2 <- sweep(s1, 2, sqrt(fJ), FUN="/") sres <- eigen(s2,symmetric=T) # Eigenvectors divided row-wise by sqrt(fJ): evectors <- sweep(sres$vectors, 1, sqrt(fJ), FUN="/") # PROJECTIONS ON FACTORS OF ROWS AND COLUMNS rproj <- as.matrix(fJsupI) %*% evectors temp <- as.matrix(s2) %*% sres$vectors cproj <- sweep(sweep(temp,1,sqrt(fJ),FUN="/"),2,sqrt(sres$values),FUN="/") # CONTRIBUTIONS TO FACTORS BY ROWS AND COLUMNS # Contributions: mass times projection distance squared. temp <- sweep( rproj^2, 1, fI, FUN="*") # Normalize such that sum of contributions for a factor equals 1. sumCtrF <- apply(temp, 2, sum) # Note: Obs. x factors. Read cntrs. with factors 1,2,... from cols. 2,3,... rcntr <- sweep(temp, 2, sumCtrF, FUN="/") # ABSOLUTE CONTRIBUTIONS (of v. large row set), THEN REL CONTRIBUTIONS, CTR cat("Abs. contr. mean, sd, med.; rel. contr. - CTR: mean, sd, med.; Max rproj value.\n'') cat("Abs: ", mean(apply(temp,1,sum)), sd(apply(temp,1,sum)), median(apply(temp,1,sum)), "Rel: ", mean(apply(rcntr,1,sum)), sd(apply(rcntr,1,sum)), median(apply(rcntr,1,sum)), "Maxproj: ", max(rproj)) Evaluation 2: Financial Futures # For our financial data, function buildmat defines as follows # loind, hiind, i.e. the indexes of the segments. buildmat <- function(sig, numsegs, lenseg) { # Input signal, number of segments, segment length. # From a signal, take successive segments of specified length, # assemble row-wise in a matrix. outmat <- sig[1:lenseg] # Very first segment. finseg <- numsegs - 1 # For final segment. for (i in 1: finseg) { # For 2nd, 3rd, ... final segment.. loind <- (i * 1000) + 1 # For our finanical data, hiind <- (i * 1000) + lenseg # we alter loind, hiind. nextseg <- sig[loind : hiind] outmat <- rbind(outmat, nextseg) # Build up by row. } outmat } x <- scan("ob5x.csv") # Read 95011 values of financial futures. x100 <- buildmat(x2, 86, 100) x1000 <- buildmat(x2, 86, 1000) x10000 <- buildmat(x2, 86, 10000) # We now have arrays of dimensions, 86 x 100, 1000, 10000 xtab <- t(x100) # PROCESSING (SEE ABOVE) xtab <- t(x1000) # PROCESSING xtab <- t(x10000) # PROCESSING Evaluation 3: Chemicals of Specified Marginal Distribution x <- read.table("inputdata.txt") # 425 x 1052 matrix. 100*sum(x)/(nrow(x)*ncol(x)) # Occupancy found to be 5.9% hist(apply(x,2,sum), xlab="Number of chemicals", ylab="Frequency", nclass=100,main="Column sum histogram") # Figure # Now investigate marginal law of the chemicals. comptes <- hist(apply(x,2,sum),nclass=100)$counts comptes[comptes == 0] <- 1 # Done to allow flexible use of log. length(comptes) # We find 82 counts or histogram bins. plot(log(1:82), log(comptes), xlab="log(presence counts)", ylab="log(frequencies)") title("Log−log plot: number of chemicals per attribute") # Figure # Once beyond 24th bin, there are many 0s. This gives rise to 0 values in the plot. # So we will just fit the regression line to the first 24 bins. abline(lsfit(log(1:24), log(comptes[1:24]))) lsfit(log(1:24), log(comptes[1:24]))$coefficients # Provides the slope. # Just to observe that the marginal distribution, or masses, of the # chemicals are approximately Gaussian. hist(apply(x,1,sum),nclass=20) qqnorm(apply(x,1,sum)) # Now for our long-tailed attributes, the marginal distribution is as follows. xJ <- apply(x, 2, sum) # Here is how we generate data with the same marginal distribution. newnr <- nrow(x); newnc <- 1000*ncol(x) # New dimensions. xnew <- matrix( rep(0,newnr*newnc), nrow=newnr, ncol=newnc) # Initialize to 0. set.seed(4781) # For reproducibility, set seed. randomcols <- floor( 1+ (ncol(x)-1) * runif(floor(newnc/4)) ) # In the foregoing, we generate newnc new column numbers; \in [1,ncol(x)] for (j in 1:length(randomcols)) { # For each col. (The cols. are power law distributed.) xj <- xJ[randomcols[j]] # The marginal sum of that col. randomrows <- floor(1+(newnr-1)*runif(xj)) # Generate nrow(x) row nos. randomly. xnew[randomrows,j] <- 1 } # End. x1000nc <- xnew # New dimensions: 425 x 1052000