Chapter 3: Basic Statistical Methods for Climate Data Analysis

Mean, variance, standard deviation, skewness, kurtosis, and quantiles

#R can directly read data from a remote data website
setwd("~/sshen/climmath/data") #set the working directory
dat1 <- read.table("aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
dim(dat1)

## [1] 136   6

tmean15 <- dat1[,2] #Take only the second column of this data matrix
head(tmean15) #The first five values

## [1] -0.367918 -0.317154 -0.317069 -0.393357 -0.457649 -0.468707

#[1] -0.367918 -0.317154 -0.317069 -0.393357 -0.457649 -0.468707
mean(tmean15)

## [1] -0.2034367

#[1] -0.2034367
sd(tmean15)

## [1] 0.3038567

#[1] 0.3038567
var(tmean15)

## [1] 0.09232888

#[1] 0.09232888
library(e1071)

## Warning: package 'e1071' was built under R version 4.1.3

#This R library is needed to compute the following parameters
#install.packages("e1071") #if it is not installed on your computer
skewness(tmean15)

## [1] 0.7141481

#[1] 0.7141481
kurtosis(tmean15)

## [1] -0.3712142

#[1] -0.3712142
median(tmean15)

## [1] -0.29694

#[1] -0.29694
quantile(tmean15,probs = c(0.05,0.25, 0.75, 0.95))

##         5%        25%        75%        95% 
## -0.5792472 -0.4228540 -0.0159035  0.3743795

#        5%        25%        75%        95% 
#-0.5792472 -0.4228540 -0.0159035  0.3743795

Correlation, covariance, and linear trend

#Plot Fig. 3.1
yrtime15 <- seq(1880, 2015)
reg8015 <- lm(tmean15 ~ yrtime15)
# Display regression results
reg8015

## 
## Call:
## lm(formula = tmean15 ~ yrtime15)
## 
## Coefficients:
## (Intercept)     yrtime15  
##  -13.208662     0.006678

par(mar = c(4,4,4,1))
# Plot the temperature time series and its trend line
plot(yrtime15,tmean15,xlab = "Year",ylab = "Temperature [°C]", 
     main = "Global Annual Mean Land and Ocean Surface\nTemperature Anomalies 1880-2015", type = "l", lwd = 2,
     cex.lab = 1.1,cex.axis = 1.1,cex.main = 1.35)
abline(reg8015, col = "red")
text(1932, 0.4, "Linear Temperature Trend 0.67 °C/century", 
     col = "red",cex = 1.1)

#Covariance
x <- yrtime15
y <- tmean15
cov(x, y)

## [1] 10.36856

#Correlation
cor(tmean15, yrtime15)

## [1] 0.8659857

#Verify this result by formulas
x <- yrtime15
y <- tmean15
sx <- sd(yrtime15)
sy <- sd(tmean15)
cxy <- cov(x,y)
rxy <- cxy/(sx*sy)
#The same result as cor(tmean15, yrtime15)
rxy

## [1] 0.8659857

#Verify the linear trend by formulas
rxy*sy/sx #verified

## [1] 0.006677908

cxy/sx^2 #verified

## [1] 0.006677908

Histogram of a set of data

h <- hist(tmean15, main = "Histogram of 1880-2015 Temperature Anomalies",
         xlab = expression(paste("Temperature Anomalies [", degree, "C]")), 
         xlim = c(-1,1), ylim = c(0,50)) 
xfit <- seq(-1,1, length = 100)
areat <- sum((h$counts)*diff(h$breaks[1:2]))#Normalization area
#diff(h$breaks[1:2]) <- h$breaks[2] - h$breaks[1] 
#is the histogram's bin width
yfit <- areat*dnorm(xfit, mean = mean(tmean15), sd = sd(tmean15))
#Plot the normal fit on the histogram
lines(xfit,yfit,col = "blue",lwd = 2,cex.lab = 1.1,cex.axis = 1.1,cex.main = 1.35)

Box plot

boxplot(tmean15, ylim = c(-0.8,0.8), 
        ylab = expression(paste("Temperature Anomalies [", degree, "C]")),
        cex.lab = 1.1)

Plot Fig. 3.4

#Use setwd("working directory") to work in the desired directory
#rm(list = ls())
setwd("~/sshen/climmath/data")
par(mgp = c(1.5,0.5,0))
ust <- read.csv("USJantemp1951-2016-nohead.csv",header = FALSE)
soi <- read.csv("soi-data-nohead.csv", header = FALSE) 
soid <- soi[,2] #Take the second column SOI data
soim <- matrix(soid,ncol = 12,byrow = TRUE) 
#Make the SOI into a matrix with each month as a column
soij <- soim[,1] #Take the first column for Jan SOI
ustj <- ust[,3] #Take the third column: Jan US temp data
setEPS()
# postscript("fig0304.eps", height = 7, width = 7)
par(mar = c(4.5,5,2.5,1), xaxs = "i", yaxs = "i")
plot(soij,ustj,xlim = c(-4,4), ylim = c(-8,8),
     main = "January SOI and the U.S. Temperature Anomalies",
     xlab = "SOI [Dimensionless]", 
     ylab = expression(paste("Temperature Anomalies [", degree, "F]")), 
     pch = 19, cex.lab = 1.1,cex.axis = 1.1,cex.main = 1.35)
#Plot the scatter plot
soiust <- lm(ustj ~ soij) #Linear regression
abline(soiust, col = "blue", lwd = 4) #Linear regression line

# dev.off()

Plot Fig. 3.5

#Q-Q plot for the standardized temperature anomalies
tstand <- (tmean15-mean(tmean15))/sd(tmean15)
set.seed(101)
qn <- rnorm(136) #Simulate 136 points by N(0,1)
qns <- sort(qn)#Sort the points
qq2 <- qqnorm(qns,col = "blue",lwd = 2,cex.lab = 1.1,cex.axis = 1.1,cex.main = 1.35)

setEPS()
# postscript("fig0305.eps", height = 7, width = 7)
par(mar = c(4.5,5,3,1), xaxs = "i", yaxs = "i")
qt <- qqnorm(tstand, 
          main = "Q-Q plot for the Standardized Global Temp.\n Anomalies vs N(0,1)", 
          ylab = "Quantile of Temperature Anomalies", 
          xlab = "Quantile of N(0,1)", xlim = c(-3,3),ylim = c(-3,3),
          cex.lab = 1.1,cex.axis = 1.1,cex.main = 1.35)
qqline(tstand, col = "red", lwd = 3)
points(qq2$x, qq2$y, pch = 19, col = "purple")

# dev.off()

Plot Fig. 3.6

plot.new()
layout(matrix(c(1,2,3), 1, 3, byrow = TRUE), 
       widths = c(3,3,3), heights = c(1,1,1))
lasvegas <- c(0.58,0.42)
sandiego <- c(0.4,0.6)
seattle <- c(0.16,0.84)
names(lasvegas) <- c("Clear","Cloudy")
names(sandiego) <- c("Clear","Cloudy")
names(seattle) <- c("Clear","Cloudy")
barplot(lasvegas,col = c("skyblue","gray"), ylim = c(0,1),
        ylab = "Probability", cex.lab = 1.2)
mtext("Las Vegas", side = 3, line = 1)
barplot(sandiego,col = c("skyblue","gray"), ylim = c(0,1))
mtext("San Diego", side = 3, line = 1)
barplot(seattle,col = c("skyblue","gray"), ylim = c(0,1))
mtext("Seattle", side = 3, line = 1)
mtext(expression(bold("Probability Distribution of Cloudiness")), 
      cex = 1.2,side = 3, line = -1.5, outer = TRUE)

Plot Fig. 3.7

# Density function, probability, and normalization of N(0,1)
#par(mar = c(4.0,4.0,1.5,0.5))
cord.x <- c(-3,seq(-3,3,0.01),-1) 
cord.y <- c(0,dnorm(seq(-3,3,0.01)),0) 
# Make a curve
curve(dnorm(x,0,1), xlim = c(-3,3), lwd = 3,
      main = 'PDF of the Standard Normal Distribution',
      xlab = "Random Variable x",
      ylab = 'Probability Density',
      cex.lab = 1.1,cex.axis = 1.1,cex.main = 1.35) 
# Add the shaded area using many lines
polygon(cord.x,cord.y,col = 'skyblue')
polygon(c(-1.5,-1.5, -1.2, -1.2),c(0, dnorm(-1.5),
                                   dnorm(-1.2), 0.0),col = 'white')
text(0,0.18, "Area = 1", cex = 1.1)
text(-1.65,0.045,"f(x)")
text(-1.35,0.075,"dx")
text(-1.6,0.0125,"x")
text(-0.9,0.0125,"x + dx")
arrows(-2,0.2,-1.35,0.13, length = 0.1)
text(-2,0.21,"dA = f(x)dx")
text(0,0.09,expression(paste(integral(f(x)*dx,- infinity,infinity)," = 1")))

Plot Fig. 3.8

#Normal distribution plot
x <- seq(-8, 8, length = 200)
plot(x,dnorm(x, mean = 0, sd = 1), type = "l", lwd = 4, col = "red",
     ylim = c(0,1),
     xlab = "Random Variable x",
     ylab = "Probability Density",
     main = expression(Normal~Distribution ~ N(mu,sigma^2)))
lines(x,dnorm(x, mean = 0, sd = 2), type = "l", lwd = 2, col = "blue")
lines(x,dnorm(x, mean = 0, sd = 0.5), type = "l", lwd = 2, col = "black")
lines(x,dnorm(x, mean = 3, sd = 1), type = "l", lwd = 2, col = "purple")
lines(x,dnorm(x, mean = -4, sd = 1), type = "l", lwd = 2, col = "green")
# ex.cs1 <- expression(plain(sin) * phi,  paste("cos", phi))
ex.cs1 <- expression(paste(mu, " = 0",~","~ sigma," = 1"),
                     paste(mu, " = 0",~","~ sigma, " = 2"),
                     paste(mu, " = 0",~","~ sigma, " = 1/2"),
                     paste(mu, " = 3",~","~ sigma, " = 1"),
                     paste(mu, " = -4",~","~ sigma, " = 1"))
legend("topleft",legend = ex.cs1, lty = 1, 
       col = c('red','blue','black','purple','green'), cex = 1, bty = 'n')

Calculate probability from a normal distribution

mu <- 0
sig <- 1
intg <- function(x){(1/(sig*sqrt(2*pi)))*exp(-(x-mu)^2/(2*sig^2))}
integrate(intg,-2,2)

## 0.9544997 with absolute error < 1.8e-11

#Or using the R built-in function dnorm to get the same result
integrate(dnorm,-2,2)

## 0.9544997 with absolute error < 1.8e-11

integrate(dnorm,-1.96,1.96)

## 0.9500042 with absolute error < 1e-11

Plot Fig. 3.9

#Plot t-distribution by R
n <- 50
x <- seq(-4, 4, length = 200)
plot(x,dt(x, df = 3), type = "l", lwd = 4, col = "red",
     ylim = c(0,0.6),
     xlab = "Random Variable t",
     ylab = "Probability Density",
     main = "Student t-distribution T(t,df)",
     cex.lab = 1.1,cex.axis = 1.1,cex.main = 1.35)
lines(x,dt(x, df = 1), type = "l", lwd = 2, col = "blue")
lines(x,dt(x, df = 2), type = "l", lwd = 2, col = "black")
lines(x,dt(x, df = 6), type = "l", lwd = 2, col = "purple")
lines(x,dt(x, df = Inf), type = "l", lwd = 2, col = "green")
#ex.cs1 <- expression(plain(sin) * phi,  paste("cos", phi))
ex.cs1 <- c("df = 3", "df = 1","df = 2","df = 6", expression(paste("df = ",infinity)))
legend("topleft",legend = ex.cs1, lty = 1, 
       col = c('red','blue','black','purple','green'), cex = 1, bty = "n")

Confidence interval simulation

mu <- 14 #true mean
sig <- 0.3 #true sd
n <- 50 #sample size
d <- 1.96*sig/sqrt(n)
lowerlim <- mu-d
upperlim <- mu+d
ksim <- 10000 #number of simulations
k <- 0 #k is the simulation counter
xbar <- 1:ksim
for (i in 1:ksim)
{
  xbar[i] <- mean(rnorm(n, mean = mu, sd = sig))
  if (xbar[i] >= lowerlim & xbar[i] <= upperlim)
    k <- k+1
}
print(c(k,ksim),cex.lab = 1.1,cex.axis = 1.1,cex.main = 1.35)

## [1]  9484 10000

Plot the histogram Fig. 3.10

hist(xbar,breaks = 51,
     main = "Histogram of the Simulated\nSample Mean Temperatures",xaxt = "n",
     xlab = expression(paste("Temperature [", degree, "C]")), 
     ylim = c(0,600),
     cex.lab = 1.1,cex.axis = 1.1,cex.main = 1.35)
axis(1,pos = -20, at = c(13.92, mu, 14.08))
text(14,530,"95% Confidence Interval (13.92, 14.08)")

Plot Fig. 3.11

#Plot confidence intervals and tail probabilities
par(mar = c(2.5,3.5,2.0,0.5))
rm(list = ls())
par(mgp = c(1.4,0.5,0))
curve(dnorm(x,0,1), xlim = c(-3,3), lwd = 3,
      main = 'Confidence Intervals and Confidence Levels',
      xlab = "True Mean as a Random Variable", ylab = "",
      xaxt = "n", cex.lab = 1.2) 
title(ylab = 'Probability Density', line = 2, cex.lab = 1.2)
polygon(c(-1.96, seq(-1.96,1.96,len = 100), 1.96),
        c(0,dnorm(seq(-1.96,1.96,len = 100)),0),col = 'skyblue')
polygon(c(-1.0,seq(-1.0, 1, length = 100), 1),
        c(0, dnorm(seq(-1.0, 1, length = 100)), 0.0),col = 'white')
polygon(c(-3.0,seq(-3.0, -1.96, length = 100), -1.96),
        c(0, dnorm(seq(-3.0, -1.96, length = 100)), 0.0),col = 'red')
polygon(c(1.96,seq(1.96, 3.0, length = 100), 3.0),
        c(0, dnorm(seq(1.96, 3.0, length = 100)), 0.0),col = 'red')
points(c(-1,1), c(0,0), pch = 19, col = "blue")
points(0,0, pch = 19)
points(c(-1.96,1.96),c(0,0),pch = 19, col = "red")
text(0,0.02, expression(bar(x)), cex = 1.0)
text(-1.50,0.02, "SE", cex = 1.0)
text(-0.60,0.02, "SE", cex = 1.0)
text(1.50,0.02, "SE", cex = 1.0)
text(0.60,0.02, "SE", cex = 1.0)
text(0,0.2, "Probability\n= 0.68")
arrows(-2.8,0.06,-2.35,0.01, length = 0.1)
text(-2.5,0.09, "Probability\n= 0.025")

Estimate the mean and error bar for a large sample

#Confidence interval for NOAAGlobalTemp 1880-2015
setwd("~/sshen/climmath/data")
dat1 <- read.table("aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
dim(dat1)

## [1] 136   6

tmean15 <- dat1[,2]
MeanEst <- mean(tmean15)
sd1 <- sd(tmean15)
StandErr <- sd1/sqrt(length(tmean15))
ErrorMar <- 1.96*StandErr
MeanEst

## [1] -0.2034367

print(c(MeanEst-ErrorMar, MeanEst+ErrorMar))

## [1] -0.2545055 -0.1523680

Plot Fig. 3.12

# setEPS()
# postscript("fig0312.eps", height = 7, width = 10)
par(mar = c(2.3,3.0,2.0,0.5))
rm(list = ls())
par(mgp = c(1.0,0.5,0))
curve(dnorm(x,0,1), xlim = c(-3,3), lwd = 3,
      main = 'Z-score, p-value, and Significance Level',
      xlab = "z: Standard Normal Random Variable",
      ylab = 'Probability Density',xaxt = "n",yaxt = "n",
      cex.lab = 1.2,cex.lab = 1.1,cex.axis = 1.1,cex.main = 1.35, ylim = c(-0.1,0.4)) 
lines(c(-3,3),c(0,0))
arrows(-3,-0.1,-2.02,-0.1, lwd = 12,col = 'skyblue', length = 0.2, code = 3)
arrows(3,-0.1,-1.90,-0.1, lwd = 12,col = 'green', length = 0.2, code = 3)
polygon(c(-3.0,seq(-3.0, -2.5, length = 100), -2.5),
        c(0, dnorm(seq(-3.0, -2.5, length = 100)), 0.0),col = 'skyblue')
polygon(c(-1.96,seq(-1.96, 3, length = 100), 3),
        c(0, dnorm(seq(-1.96, 3, length = 100)), 0.0),col = 'lightgreen')
points(-1.96,0, pch = 19, col = "red")
points(-2.5,0,pch = 19, col = "skyblue")
text(-1.4,-0.02, expression(z[0.025]~' = -1.96'), cex = 1.1, col = 'red')
text(-2.40,-0.02, "z-score", cex = 1.1, col = 'skyblue')
arrows(-2.8,0.06,-2.6,0.003, length = 0.1)
lines(c(-1.96,-1.96),c(-0.1, .4),lwd = 1.5, col = 'red')
text(-2.5,0.09, "p-value", cex = 1.3) 
text(1.0,-0.06, expression(H[0] ~'region'), cex = 1.1)
text(-2.5,-0.06, expression(H[1] ~'region'), cex = 1.1)
text(0,0.15, expression(H[0] ~'probability 0.975'), cex = 1.1)

# dev.off()

Hypothesis test for NOAAGlobalTemp 2006-2015

setwd("~/sshen/climmath/data")
dat1 <- read.table("aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
tm0615 <- dat1[127:136,2]
MeanEst <- mean(tm0615)
MeanEst

## [1] 0.4107391

sd1 <- sd(tm0615)
sd1

## [1] 0.1023293

n <- 10
t_score <- (MeanEst - 0)/(sd1/sqrt(n))
t_score

## [1] 12.69306

1-pt(t_score, df = n-1)

## [1] 2.383058e-07

qt(1-0.025, df = n-1)

## [1] 2.262157

Hypothesis test for global temp for 1981-1990 and 1991-2000

setwd("~/sshen/climmath/data")
dat1 <- read.table("aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
tm8190 <- dat1[102:111,2]
tm9100 <- dat1[112:121,2]
barT1 <- mean(tm8190)
barT2 <- mean(tm9100)
S1sd <- sd(tm8190)
S2sd <- sd(tm9100)
n1 <- n2 <- 10
Spool <- sqrt(((n1 - 1)*S1sd^2 + (n2 - 1)*S2sd^2)/(n1 + n2 -2))
t <- (barT2 - barT1)/(Spool*sqrt(1/n1 + 1/n2))
tlow <- qt(0.025, df = n1 + n2 -2)
tup <- qt(0.975, df = n1 + n2 -2)
paste("t-score = ", round(t,digits = 5), ",",
      "tlow = ", round(tlow,digits = 5), ",",
      "tup = ", round(tup,digits = 5))

## [1] "t-score =  2.57836 , tlow =  -2.10092 , tup =  2.10092"

pvalue <- 1-pt(t,  df = n1 + n2 -2)
paste( "p-value = ", pvalue)

## [1] "p-value =  0.00947040009284539"

paste("1981-90 temp = ", barT1,",", "1991-00 temp = ",barT2)

## [1] "1981-90 temp =  0.0368621 , 1991-00 temp =  0.1612545"

barT2 - barT1

## [1] 0.1243924

Statistical inference of a linear trend

setwd("~/sshen/climmath/data")
dat1 <- read.table("aravg.ann.land_ocean.90S.90N.v4.0.0.2015.txt")
tm <- dat1[,2]
x <- 1880:2015
summary(lm(tm ~ x))

## 
## Call:
## lm(formula = tm ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31028 -0.09609 -0.01724  0.11583  0.40289 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.321e+01  6.489e-01  -20.36   <2e-16 ***
## x            6.678e-03  3.331e-04   20.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1525 on 134 degrees of freedom
## Multiple R-squared:  0.7499, Adjusted R-squared:  0.7481 
## F-statistic: 401.9 on 1 and 134 DF,  p-value: < 2.2e-16

R Code for Climate Mathematics:

Theory and Applications

A Cambridge University Press book By

SSP Shen and RCJ Somerville

Chapter 3: Basic Statistical Methods for Climate Data Analysis

Mean, variance, standard deviation, skewness, kurtosis, and quantiles

Correlation, covariance, and linear trend

Histogram of a set of data

Box plot

Plot Fig. 3.4

Plot Fig. 3.5

Plot Fig. 3.6

Plot Fig. 3.7

Plot Fig. 3.8

Calculate probability from a normal distribution

Plot Fig. 3.9

Confidence interval simulation

Plot the histogram Fig. 3.10

Plot Fig. 3.11

Estimate the mean and error bar for a large sample

Plot Fig. 3.12

Hypothesis test for NOAAGlobalTemp 2006-2015

Hypothesis test for global temp for 1981-1990 and 1991-2000

Statistical inference of a linear trend