AN INTRODUCTION TO R FOR DATA MANIPULATION AND PROCESSING, AND PLOTTING
=======================================================================

IN THE FOLLOWING, WE WILL LOOK AT 

- TYPING IN VALUES, DOING SOME SIMPLE OPERATIONS, AND PLOTTING
- SAVING PLOT OUTPUT AS PDF (FOR YOUR PAPERS) OR AS JPEG (FOR PRESENTATIONS)

- READING A DATA MATRIX IN TO R
- SAVING DATA IN R TO A TEXT FILE
- LOADING A PACKAGE
- MATRIX OPERATIONS
- PLOTTING OPERATIONS, INCLUDING AXIS LABELING 
- PRINCIPAL COMPONENTS ANALYSIS
- HIERARCHICAL CLUSTERING
- IN THE LATTER CASE, USE OF A C PROGRAM IN THE R ENVIRONMENT
- LINEAR AND QUADRATIC DISCRIMINANT ANALYSIS
- READING IN TIME SERIES, AND PLOTTING

COMMENTED LINES IN THIS SCRIPT ARE INDICATED BY "#" AT THE START 

R IS A PUBLICLY AVAILABLE SYSTEM (WITH A LOT OF ONGOING SUPPORT),
AVAILABLE AT: www.r-project.org
(ORIGIN OF THE NAME: THE S SYSTEM CAME OUT OF BELL LABS.  S-PLUS IS 
A SUPPORTED, COMMERCIAL, EXPANDED VERSION OF S.  R IS A PUBLIC VERSION
OF S-PLUS)

CODE IN R, AND IN C (AND FORTRAN) FOR USE IN THE R ENVIRONMENT, IN 
PARTICULAR FOR MULTIVARIATE DATA ANALYSIS, IS AVAILABLE AT:
astro.u-strasbg.fr/~fmurtagh/mda-sw


On Sun Sparc system, I use command Splus5 to run S-Plus.
To create a window for plots, I use command: motif()
On Windows or Mac, I use command R in a command window.
For plot functionality on a Mac, I click on the R icon.

---------------------------------------------------------------------------

# Get R running 
x <- c(2.3, 3.1, 5, 7, 8)
x
sqrt(x)
y <- c(3.1, 1.4, 6.1, 3.4)
y
length(x); length(y)
y <- c(y, 1.1)
y
plot(x,y)
plot(x,y,type="l")
plot(x,y,type="l",xlab="Daily highs",ylab="gm/cm^2")
x <- runif(40)
x
plot(x, type="l")       # Same as: plot(1:40, x, type="l")
plot(x, type="b")
title("First plot")
# File / Save as... for PDF
# Copy / Then in Preview: New from Clipboard / Export ... for JPEG


# Reading in data.  Look at sodata.dat.  We'll read in in two ways.
# Firstly, explicitly saying that each column is numeric, and 
# then putting columns together into array.
# Secondly, reading directly into array.
# Then we will proceed to look at establishing labels (for rows, cols.)
# which are needed later in plots.

so.data <- scan("/Users/fionnmurtagh/Saothar-G4/r-work-sxb/sodata.dat",
  list(a=0,b=0,c=0,d=0,e=0,f=0,g=0,h=0))
# List of elements of type numeric read.
sodata <- cbind(log(so.data$a),so.data$b,so.data$c,so.data$d,
                so.data$e,so.data$f,so.data$g,so.data$h)
# cbind = "column bind" 
# Look at contents of sodata in R by typing variable name: sodata

# Alternative input:
sodata2 <- read.table("/Users/fionnmurtagh/Saothar-G4/r-work-sxb/sodata.dat")

# Note: the file name can be a URL.
# Look at contents: sodata2

# Note difference in handling (row, column) labels.  

# Let's be explicit about the labels.  With command read.table we
# could have had them in the input file.  The attribute of the
# variable containing the labels is called dimnames, and this is 
# of type list.

dimnames(sodata) <- list(NULL,c("log(t_rel)","R_gc","Z_g",
                "log(mass)","c","[Fe/H]","x","x_0"))
dimnames(sodata)[[1]] <- list("M15","M68","M13","M3","M5","M4",
                "47 Tuc","M30","NGC 6397","M92","M12","NGC 6752","M10","M71")

# Now look at sodata again

---------------------------------------------------------------------------

# Principal components analysis using R command princomp.

help(princomp)
sopr <- princomp(sodata, cor=TRUE)
names(sopr)
# sopr is an object with various components.
sopr$loadings
sopr$scores

clbls <- c("log(t_rel)","R_gc","Z_g",
                "log(mass)","c","[Fe/H]","x","x_0")
rlbls <- c("M15","M68","M13","M3","M5","M4",
                "47 Tuc","M30","NGC 6397","M92","M12","NGC 6752",
                "M10","M71")
plot(sopr$scores[,1],sopr$scores[,2],type="n")
# Empty plot to begin with.
plot(sopr$scores[,1],sopr$scores[,2],type="n",xlab="PC1",ylab="PC2")
text(sopr$scores[,1],sopr$scores[,2],rlbls)
title("Principal plane")

# Draw axes using an external script
source("/Users/fionnmurtagh/Saothar-G4/r-work-sxb/plaxes.q")
plaxes(sopr$scores[,1], sopr$scores[,2])
# Save plot as PDF file.  Later can convert to other formats.

---------------------------------------------------------------------------

# Next hierarchical clustering
# Compile to produce object file:
# gcc3 -no-cpp-precomp -c hierclustx.c 
# R CMD SHLIB hierclustx.o
# produces shared object file: hierclustx.so
 
dyn.load("/Users/fionnmurtagh/Saothar-G4/r-work-sxb/hierclustx.so")
source("/Users/fionnmurtagh/Saothar-G4/r-work-sxb/hcluswtd_c.r")
data(iris)
x <- iris[,1:4]
xh <- HierClust(as.matrix(x), rep(1/150, 150))
plclust(xh)
cutree(xh, k=3)

# PCA of iris data
ipc <- princomp(x,cor=TRUE)
# Plot with "+" for each point (iris) in principal plane
plot(ipc$scores[,1], ipc$scores[,2], pch="+")
# How about a different symbol of 1-50, 51-100, 101-150?
plot(ipc$scores[1:50,1], ipc$scores[1:50,2], pch="+")
# Nope, better establish axes in their entirety to start with
plot(ipc$scores[,1], ipc$scores[,2], type="n")
points(ipc$scores[1:50,1], ipc$scores[1:50,2], pch="+")
points(ipc$scores[51:100,1], ipc$scores[51:100,2], pch="-")
points(ipc$scores[101:150,1], ipc$scores[101:150,2], pch="|")


soh <- HierClust(as.matrix(sodata), rep(1/14, 14))
plclust(soh)
sohc <- HierClust(as.matrix(t(sodata)), rep(1/8, 8))
plclust(sohc,labels=clbls)

---------------------------------------------------------------------------

# Discriminant analysis

# We will look at LDA and QDA here.  First, we look at the 
# creation of random samples.  Then we apply them to the 
# Fisher iris data.  


# LDA and QDA (resp., linear/quadratic discriminant analysis) 
# are in package MASS, loaded as follows 
library(MASS)

# Create a sample: 25 values uniformly chosen in 1:50
tr <- sample(1:50, 25)

tr
# Example:
# [1] 50  7 41  3 15 25 33 19 36 43 34 28 30 46 40 35 48 45 47 17 
# 44 11 13 21  9

slct <- c(tr, tr+50, tr+100)
slct
# [1]  50   7  41   3  15  25  33  19  36  43  34  28  30  46  40  
# 35  48  45  47
# [20]  17  44  11  13  21   9 100  57  91  53  65  75  83  69  86  
# 93  84  78  80
# [39]  96  90  85  98  95  97  67  94  61  63  71  59 150 107 141 
# 103 115 125 133
# [58] 119 136 143 134 128 130 146 140 135 148 145 147 117 144 111 
# 113 121 109

ts <- sample(1:50,25)
ts
# [1]  2 26 11 24 23 14 25 45 44 16 47 19 17  7 21 39 32 22  9 13 33 
# 38 37 35  3
selectts <- c(ts, ts+50, ts+100)

iris[1,]
#  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1          5.1         3.5          1.4         0.2  setosa

train <- rbind(iris[tr, -5], iris[tr + 50, -5], iris[tr + 100, -5])
xxx <- rbind(iris[slct, -5])       # Identical
yyy <- rbind(iris[slct, 1:4])      # Identical


test <- rbind(iris[selectts, -5])

nrow(train); ncol(train); nrow(test); ncol(test)
# [1] 75
# [1] 4
# [1] 75
# [1] 4

cl <- factor(c(rep("s",25), rep("c",25), rep("v",25)))
cl
# [1] s s s s s s s s s s s s s s s s s s s s s s s s s c c c c c c 
# c c c c c c c c
# [40] c c c c c c c c c c c v v v v v v v v v v v v v v v v v v v v 
# v v v v v
# Levels: c s v

# Carry out LDA on the training set... 

z <- lda(train, cl)
names(z)
# [1] "prior"   "counts"  "means"   "scaling" "ldet"    "lev"     
# "N"       "call"   

# ... and test using the test set.
# Class memberships are given by one of c, s, v.

predict(z, test)$class
# [1] s s s s s s s s s s s s s s s s s s s s s s s s s c c c c c c c 
# c c c c c c c
# [40] v c c c c c c c c c c v v v v v v v v v v v v v v v v v v v v v 
# v v v v
# Levels: c s v

# Carry out QDA on the training set...
# ... and then test on the test set.

z <- qda(train, cl)
predict(z, test)$class
# [1] s s s s s s s s s s s s s s s s s s s s s s s s s c c c c c c c 
# c c c c c c c
# [40] v c c c c c c c c c c v v v v v v v v v v v v v v v v v v v v 
# v v v v v
# Levels: c s v

---------------------------------------------------------------------------

Read in a 1-dimensional time series.

fin <- scan("~/Saothar-G4/r-work-sxb/ftse1326.dat")
plot(1:length(fin), fin, type="l", xlab="Time (days)", ylab="FTSE")

---------------------------------------------------------------------------