## R practicals
## Course on SZTE TTIK
## Files: http://www.inf.u-szeged.hu/~csicsman/oktatas/adatbanyaszat/adatbanyaszat.html
##        http://www.math.bme.hu/~csicsman/oktatas/statprog/statprog.html
## Anna Laszlo
## 2017.05.08, 05.15


## References, further readings: 
## - The R Project for Statistical Computing: http://www.r-project.org/
## - RStudio: http://www.rstudio.com/
## - Help, support, docs from the software and/or internet
## - Coursera - Computing for Data Analysis course at Johns Hopkins by Roger Peng (4 weeks long, started on 23rd of September, 2013)
## - Bob Muenchen: R for SAS and SPSS Users (http://r4stats.com)
## - Quick-R: http://www.statmethods.net/interface/guis.html - great descriptions about methods and examples
## - Peter Dalgaard: Introductory Statistics with R (in Statistics and computing series), 2nd ed. Springer, 2008.
## - Alain F. Zuur, Elena N. Ieno, Erik H.W.G. Meesters: A Beginner's Guide to R (in Use R! series), Springer, 2009.
## - Randall Schumacker, Sara Tomek: Understanding Statistics Using R, Springer, 2013.
## - Reiczigel Jenő, Harnos Andrea, Solymosi Norbert: Biostatisztika nem statisztikusoknak (http://biostatkonyv.hu/) - Biostatistics for not statisticians, Pars Kft. Nagykovácsi, 2014.




############### Practical 1 ###############

##### About R, RStudio
## Originated from S language, which was developed by John Chambers & Bell Labs in 1976
## 1988 - implementation in C (Version 3)
## Whitebook: Statistical models in S
## Greenbook: Programming with data (Version 4)

## R: Implementation of S language
## 1991 New Zealand: Ross Ihaka & Robert Gentleman -> R
## 1993 First announcement
## 1995 GNU (General Public Licence) -> free
## 1996 ICGS paper - R is documented
## 1997 R Core group
## 2000 R version 1
## R is a command line driven program. 
## The user enters commands at the prompt ( > by default ) and each command is executed one at a time. 
## If we run scripts or .r files, we can execute more lines - for those we can use code editors (like RStudio).

## Devided to modular Packages: ~4000 (not controlled by R Core! Developed by users!)
## (For example Bioconductor for genomic analysis is a whole project - one of the biggest ones!)
## in RStudio to install a package: run install.packages() command, or:
## right bottom window -> Packages -> Install Packages ->
## write the name of the required package -> Install (it finds it via CRAN from the web) ->
## then to use an installed package select the library in Package (or run the library() command)
## To install packages we have to be system administrators on the computer

## Drawback!! Objects are stored in physical memory, which can be a limitation!

## RStudio UI structure:
## 3 windows by default (first use): 
## - Console: bottom left                     ## to clear 50 rows from the console: cat(rep("\n",50)). To clear the whole: Ctrl+l, or to do it with a function, find: http://tolstoy.newcastle.edu.au/R/help/06/02/21634.html
## - Workspace/History: upper  right
## - Files/Plots/Packages/Help: bottom left
## File -> New -> R Script: opens a new editor window above the Console -> save: .r file


## Help - using exact command names
help(summary)
?mean
?means                  ## No documentation for 'means' in specified packages and libraries
??means                 ## wider search
demo()                  ## it lists the available demos separated to packages and functions in them
demo(colors)
showCols2()
demo(graphics)
RSiteSearch("means")    ## search via internet

## Freely available: 
## R: http://www.r-project.org/ -> http://cran.rapporter.net/
## RStudio: http://www.rstudio.com/ide/download/desktop

## In RStudio: run a command: Ctrl+Enter
## There are other GUIs for R 
## i.e. TinnR is good for long scripts for debugging - http://www.sciviews.org/Tinn-R/
## or R Commander

## Capital letters matter!!

## unlike SAS or SPSS (which handle each with different systems that follow different rules), 
## R mixes (integrates) the 5 main categories of functions:
## - data input and management
## - statistical analysis and graphical procedures
## - output management (SAS: ODS, SPSS: OMS)
## - macro language: to repeat written commands
## - matrix language: to write new algorithms (SAS: IML, SPSS: Matrix)


##### Import data
## Set working directory: Session -> Set Working Directory -> Choose directory -> select own folder
setwd("C:/Anna/SAS/2016-17-II/gyak/12-13_R/data")

## read data

## http://www.ats.ucla.edu/stat/spss/examples/da/daspss_dl.htm
## Variable A is three levels of treatment, 1 is control, 2 is positive, and 3 is negative.  
## B is a second independent variable and has three levels, 1 = low F & low E, 
## 2 = High F & Low E, 3 = High F & High E. 
## The dependent variable is the memory score.
a <- read.table("Keppel_ch12_memory.txt", header=TRUE)     ## from txt file, where first row contains variable names
View(a)                                                    ## open whole dataset - in RStudio it opens in a nice data browser
head(a, n = 6)                                             ## print the first 6 rows of dataset
names(a)                                                   ## variable names of a dataset
names(a)[1] <- "subject"                                   ## rename variable names
names(a)[2] <- "treatment"
names(a)[3] <- "factor_FE"
names(a)
attach(a)

mean(memory)

## read data from CSV file format
## http://www.ats.ucla.edu/stat/spss/examples/da/daspss_dl.htm
## Variable A has three levels, representing three different instruction techniques, 
## and the outcome is the students comprehension score.
b <- read.csv2("Keppel_ch13_comprehension.csv")        ## ; separated values
## , separated values - read.csv()
attach(b)                                               ## attach variables from dataset to memory - if not, to use variable "age", b$score should be run
str(b)                                                  ## structure of a dataset


## from SPSS
## http://www.ats.ucla.edu/stat/spss/examples/alr2/alr2spssch1.html
## Age and coronary heart disease (CHD) status of 100 subjects. 
library(foreign, pos=4)      ## Package to Read Data Stored by Minitab, S, SAS, SPSS, Stata, Systat, dBase, ...
Dataset <- read.spss("Hosmer_ch1_CHD.sav", use.value.labels=TRUE, max.value.labels=Inf, to.data.frame=TRUE)
colnames(Dataset) <- tolower(colnames(Dataset))         ## to use only lowercase letters for variable names
attach(Dataset)



## Data structure mainly: 
## Rows: subjects, observations, cases (customers, patients, experimental animals, plants, ...)
## Columns: variables - properties, measurements, features, characteristics, attributes (gender, age, body mass, height, blood pressure, salaries, ...)

## Longitudinal data
## Wide or long file format
## To change data structure from long to wide: 
## http://stackoverflow.com/questions/14296918/complex-long-to-wide-data-transformation-with-time-varying-variable
## http://www.statmethods.net/management/sorting.html
dat <- read.table(text = "
                  id    cohort    s    
                  1       1       2
                  1       1       2
                  1       1       1
                  1       1       4
                  2       3       1
                  2       3       1
                  2       3       3
                  3       2       1
                  3       2       2
                  3       2       3
                  3       2       3
                  3       2       4", 
                  header=TRUE)     
datframe <- data.frame(dat, period=sequence(rle(dat$id)$lengths) )    ## period will be a new variable containing timepoints; rle(): Run Length Encoding
widedat <- reshape(datframe, v.names="s", sep="", idvar=c("id", "cohort"), timevar="period", direction="wide")
widedat[is.na(widedat)] = 0                                           ## missing data will be replaced with 0
widedat                                                               ## printing - the same as: print(widedat)
## reshape our wide data format into long:
longdat <- reshape(widedat, direction="long", timevar="period")       ## we got back "dat", just the rows are in a different order
longdat[order(longdat$id),]                                           ## reordering rows



##### Using R the first time
## calculator: it can be used as a normal calculator with well known mathematic properties
5+12
7-9
5*7
8/2
2^4
1+2^3
(1+2)^3

7+9 - (9+7)                 ## commutative
(10+20)+30 - (10+(20+30))   ## associative
(1+2)*3 - (1*3+2*3)         ## distributive



##### Data types
## In R everything is an object
## 5 basic classes: 
## character, numeric (real), integer (with L suffix), complex, logical (True, False)

Inf    ## infinite
NaN    ## undefined value or missing

## sequence
x <- c(1,4,7,2,6,2,3,2,4,5,6,4,6)      ## assignment operator: like "="; c(): concatenate
c <- c("apple","pear","plum"); c       ## with c() we can also concatenate strings
## using ; more commands can be executed in the same line
y <- 3:10
y                                      ## autoprint, the same as print(y)
print(y)                               ## [1] means the first element of a vector

class(x)                               ## data type
z <-as.numeric(x)                      ## conversion
y <-as.logical(x)                      ## it rewrites object y
y
f <- as.factor(x)
f
x2 <- as.numeric(f)
x2
f2 <- c("aa","bb")                     ## a string factor can NOT be converted to numeric!
x2 <- as.numeric(f2)


## vector
x <- vector("numeric", length=8)       ## previous x vector is rewritten: [1] 0 0 0 0 0 0 0 0 0 0
## by default it fills the vector with 0s
x <- rbinom(n=10,size=20,prob=0.5)     ## generating a random vector from a binomial distribution with 10 elements, equal probablities between 0 and 20
x
## indexing
x[1]                   ## simple numeric selector
x[2:3]                 ## sequence selector
x[c(2,6)]              ## vector selector
x[x>10]                ## logical selector


## matrix: vector with 2 dimensions
m <- matrix(1:6, nrow=2, ncol=3)       ## constructed columnwise
m2 <- matrix(1:6,nrow=3,ncol=2)
m %*% m2                               ## matrix multiplication


list(1,"a",TRUE,1+4i,4.78,8L,"Hello World!")  ## it creates 7 vectors: [[4]] - 4th vector


## factor: categorical (discrete) variable: nomenclature
ans <- factor(c("yes","yes","no","yes","no","no","no"))
ans
table(ans)
ans12 <- unclass(ans)
ans12
## set the baseline: the first value of levels is the baseline
ans <- factor(c("yes","yes","no","yes","no","no","no"),levels=c("yes","no"))
ans12 <- unclass(ans)
ans12


## data frames: 
d <- data.frame(x=1:4,y=5:8,z=2:5)      ## datasets are constructed in data frames
x                                       ## x is already defined
d$x                                     ## to use x variable in d data frame, we have to name it exactly





##### Data management

## binding columns and rows
cbind(x,y)
rbind(x,y)

rm(x,y)                                 ## delete objects
x <- rbinom(n=10,size=20,prob=0.5)
x
x <- sort(x)                                 ## sort values ascending in a vector



##### Adding new variables in the dataset
## creating a new variable based on one in the dataset, and add it to the same dataset
a <- transform(a, mem_gr=ifelse(memory>8,1,0))     ## mem_gr will be our new variable



##### Selecting rows or columns from a dataset - Filtration
aa <- a[c(2:5)]                                    ## keep the 2nd-5th variables in dataset "a"
aa <- a[-1]                                        ## drop (exclude) first variable
aa <- a[treatment==1 | treatment==2,c(2:5)]        ## select rows, where treatment is 1 or 2



##### Treating missing values
nrow(aa)                ## check the number of rows in a dataset
ah=na.omit(aa)          ## a is a data frame - delete rows with missing values in any column
attach(ah)

x <- c(1,2,NA,4,NA,5)
bad <- is.na(x)         ## logical vector
x[!bad]

x <- 1:4
y <- c(2:4,NA)
good <- complete.cases(x,y)
x[good]
y[good]





##### Data generation
## http://en.wikibooks.org/wiki/R_Programming/Working_with_data_frames
N <- 100
u <-  rnorm(N)                    ## generate a variable from a random normal distribution
x1 <- rnorm(N)
x2 <- rnorm(N)
y <- 1 + x1 + x2 + u
mydat <- data.frame(y,x1,x2)

xx <- rnorm(1000,mean=75,sd=10)
mean(xx)
hist(xx)
plot(density(xx))

## generating i.e. body mass values with mean=75 and standard deviation=15 (n=250)
## symmetrical distribution
bm <- rnorm(250,75,15)   
## checking distribution visually
hist(bm)                  ## histogram
plot(density(bm))         ## density curve
qqnorm(bm);qqline(bm)     ## QQ-plot


## body mass differences between 2 time points
## distribution is skewed to the right
bmdiff <- rlnorm(250,0,0.3)
hist(bmdiff)
plot(density(bmdiff))
qqnorm(bmdiff);qqline(bmdiff)

## blood sugar levels measured at 2 timepoint
bs1 <- rlnorm(250,2,0.3)
hist(bs1)
plot(density(bs1))

bs2 <- rlnorm(250,2,0.5)
hist(bs2)
plot(density(bs2))

## put together generated variables into a dataset
health_gen_data <- data.frame(bm, bmdiff, bs1, bs2)
## create a CSV file from this dataset
write.csv(health_gen_data, file = "health_gen_data.csv")




##### Data merge
id <- 1:250
df1 <- data.frame(id,bm, bmdiff)
df2 <- data.frame(id,bs1, bs2)
df22 <- df2[order(df2$bs1),] 
## merge two data frames by variable "id"
merge_df <- merge(df1,df22,by="id")


## Adding rows (set of 2 datasets in SPSS)
hgd <- cbind(health_gen_data, id)
set_df <- rbind(hgd[c(5,1:2)],df1)




##### Descriptive statistics
## Distribution of Discrete variables
gender <- factor(c("M","F","M","M","F","M","F","M","M","F"),levels=c("M","F"))
eye_color <- factor(c(1,1,3,2,3,1,3,2,1,3),levels=c(1,2,3))
score <- factor(c(1,2,4,3,4,5,5,3,5,4),levels=c(1,2,3,4,5))
data <- data.frame(gender, eye_color, score)
table(data$gender)                          ## frequency table
table(data$eye_color)
table(data$gender,data$eye_color)           ## contingency table

table(data$score)
cumsum(table(data$score))                   ## cumulative frequencies
prop.table(table(data$score))               ## relative frequencies
cumsum(prop.table(table(data$score)))       ## cumulative relative frequencies

barplot(table(score))                       ## frequency bar chart
barplot(table(score),main="Gyakorisági oszlopdiagram a jegyek megoszlására",
        xlab="Érdemjegyek", ylab="Gyakoriság (fő)", col="blue") 
barplot(cumsum(table(score)))               ## cumulative frequency bar chart
barplot(prop.table(table(score)))           ## relative frequency bar chart
barplot(cumsum(prop.table(table(score))))   ## cumulative relative frequency bar chart

pie(table(score), col=rainbow(5))           ## pie chart


table(b$group)
barplot(table(b$group))
pie(table(b$group))



## Distribution of Continuous variables
x <- c(1,6,3,8,2,5,9,2,3,1,NA,5)
summary(x); "szórás:"; sd(x, na.rm=T); "esetszám hiányzó értékekkel együtt:"; length(x)     ## summary statistics treating missing values in calculating the standard deviation (na.rm=T)
range_x <- max(x,na.rm=T)-min(x,na.rm=T)                      ## range
iqr_x <- quantile(x,0.75,na.rm=T)-quantile(x,0.25,na.rm=T)    ## interquartile range
p05 <- quantile(x,0.05,na.rm=T)                               ## 5% percentile
names(sort(-table(x)))[1]                                     ## the first mode

hist(x)                                   ## histogram
hist(x, col="lightblue", breaks=c(0,4,8,12))     ## histogram with different break points
boxplot(x)                                ## box-whisker plot (box diagram)
plot(density(x, na.rm=T))                 ## density curve
plot(ecdf(x))                             ## distribution function
qqnorm(x);qqline(x)                       ## Normal Quantile-Quantile plot




##### Summary tables
## Aggregating data
gr <- rbinom(250,2,0.5)                   ## generating a grouping variable
hgd <- cbind(hgd,gr)                      ## then adding it to dataset "hgd"
## calculate means of variables in dataset "hgd" into a different dataset (called aggdata)
aggdata <- aggregate(hgd, by=list(gr),  FUN=mean, na.rm=TRUE)     
aggdata

summ <- sapply(hgd,summary)

sd <- apply(hgd, 2 ,sd)       ## in apply 2 means 2nd dimension, which is columns
N <- apply(hgd, 2, length)
rbind(summ,sd,N)


## getting percentiles
library(Hmisc)
describe(hgd) 

## or another type of descriptive stat.
## rows: statistics, columns: variables
install.packages("pastecs")
library(pastecs)
stat.desc(hgd) 

## or the transponate of previous format
install.packages("psych")
library(psych)
describe(hgd)



mean <- tapply(hgd$bm, hgd$gr, mean)
sd <- tapply(hgd$bm, hgd$gr, sd)
N <- tapply(hgd$bm, hgd$gr, length)
rbind(mean,sd,N)




##### Graphics with more details and parameters
## Scatter plot: to check the association between 2 continuous variables
x <- b$score[b$group==1]
y <- b$score[b$group==2]
y <- rnorm(12,4,1)
x <- rnorm(12,6,3)
plot(x,y)
abline(lm(y~x), col="red", lwd=3)   ## linear regression
cor(x,y)                            ## correlation - Pearson correlation coefficient



##### creating a barplot with more attributes, then saving it to PDF
freq <- matrix(c(20,11,29,39,31,45), nrow=3, ncol=2)
percent <- matrix(c(33,18,48,34,27,39), nrow=3, ncol=2)

pdf(file = "barplot_LA_2017-05-08.pdf")           ## open a PDF file where we save the followings until dev.off()

par(mar=c(8,5,4,2))                               ## set margins (bottom, left, top, right)
a <- barplot(
  percent,                                        ## percentage values from a vector
  beside=T,                                       ## vertical bar chart
  axes=T,                                         ## vertical axis is visible
  xlab="",                                        ## label of x axis
  ylab="",                                        ## label of y axis
  names.arg=c("Cases","Controls"),                ## names of grouping variable groups on x axis
  border=c("black"),                              ## color of bar borders
  space=c(0.2, 1.5),                              ## spaces between bars (in case of more factors, more space values can be determined)
  width=0.8,
  xlim=c(1,8),                                    ## scale of x axis
  col=c("darkolivegreen","darkolivegreen3","darkolivegreen1"), ## colors for bars (with less colors than as many bars we have, it repeats)
  main="Oral contraceptive use in cases and controls",         ## title of tha bar chart    
  ylim=c(0,60)                                                 ## scale of y axis     
)
#legend of the graph: cex-letter size, bty-without border, fill-color
legend("topleft", c("Nonusers","Previous users","Current users"), cex=0.9, bty='n',
       fill=c("darkolivegreen","darkolivegreen3","darkolivegreen1"))
text(x = a, y = percent, label = paste(percent,"%"), pos = 3, cex = 0.8, col = "black", font=2)
text(x = a, y = percent, label = freq, pos = 1, cex = 0.6, col = "black")
axis(1, at=a, labels=F, tck=0)                           ## 1: horisontal axis, tck: length of major tick mark
## footnote
mtext(side=1,"Figure 2 Oral contraceptive use for women in percentages (above bars)
      within case and control groups. Frequencies are also shown inside the bars.",
      outer=T, cex=0.8, line=-2.5, font=3, adj=0.5       ## cex: size of letters, line, adj: position, font: font type - 3: Italic
)
## axes labels
mtext(side=2,"Percentages", cex=0.9, line=2.8, adj=0.6)
mtext(side=1,"Oral contraceptive use for women", cex=0.9, line=2.5)

dev.off()                                                ## finish writing the PDF file




##### Mean and standard deviation plot 
## by using a function, which creates error bars
error.bar <- function(x, y, upper, lower=upper, length=0.1,...){
  if(length(x) != length(y) | length(y) !=length(lower) | length(lower) != length(upper))
    stop("vectors must be same length")
  arrows(x,y+upper, x, y-lower, angle=90, code=3, length=length, ...)
}

hgd.means <- apply(health_gen_data[3:4],2,mean)
hgd.sd <- apply(health_gen_data[3:4],2,sd)
hgd.n <- length(health_gen_data$bs1)

bar <- barplot(hgd.means, col="lightgreen", main="Mean and standard error plot",
               ylim=c(0,12), axis.lty=1, xlab="Time", ylab="Mean", xlim=c(0,4))
error.bar(bar,hgd.means, hgd.sd/sqrt(hgd.n))


mean_group <- tapply(b$score, b$group, mean)
sd_group <- tapply(b$score, b$group, sd)
n_group <- tapply(b$score, b$group, length)
bar <- barplot(mean_group, col="orange", main="Mean and standard deviation plot",
               ylim=c(0,20), axis.lty=1, xlab="Group", ylab="Mean", xlim=c(0,4))
error.bar(bar,mean_group, 1.96*sd_group/sqrt(n_group))




##### Bland-Altman plot
BAplot <- function(x,y,yAxisLim=c(-1,1),xlab="Average", ylab="Difference") {
  d <- ((x + y)/2)
  diff <- x - y        
  plot(diff ~ d,pch=16,ylim=yAxisLim,xlab=xlab,ylab=ylab)
  abline(h=mean(diff)-c(-2,0,2)*sd(diff),lty=2)
}

BAplot(health_gen_data$bs2,health_gen_data$bs1,yAxisLim=c(-30,30))




##### Save/export outputs into the Workspace
## Save a plot
png("histogram.png",width = 480, height = 480, units = "px")
hist(health_gen_data$bs2,col="indianred")
dev.off()

## to Write a Text File
write.table(a,"mydata.txt",sep="\t")

## to Write a SAS Export File and a program to read it into SAS
library(foreign)
write.foreign(a, "mydata.txt", "mydata.sas", package="SAS")

## to Write an SPSS Export File and a program to read it into SPSS
library(foreign)
write.foreign(a, "mydata.txt", "mydata.sps", package="SPSS")

## sink() could also be used to send R output to a file - check help


##### useful functions
version                ## gives the version of os -> to check whether we have enough memory
## if we have a dataset with 1,500 tousand rows and 120 columns
## 1,500,000 * 120 * 8 bytes -> 1,34 GB
## with 2 GB RAM it is in the border to handle this data
ls()                   ## list of objects in the memory
dir()                  ## list of datasets in the memory




##### using other built in graphical interface with more functions in menus
## R Commander: it generates the code as well
## the package first has to be installed: 
install.packages("Rcmdr")
library("Rcmdr")            ## in RStudio it offers us to load a list of other packages
## because it uses them - and automatically loads these contrebuted ones







############### Practical 2 ###############

##### Hypothesis testing
library(datasets)
library(help="datasets")    ## list of datasets
View(cars)
str(cars)

##### One-sample t-test
## H0: mu = c       (mu is the population mean, c is a constant)
## HA: mu != c
## Assumption: normally distributed population, independent cases

## variable
var <- cars$speed

## checking normality
summary(var); sd(var,na.rm=T); length(var)
hist(var,col="brown",main="histogram of car speed")       
plot(density(var),main="Density estimate of car speed")    
qqnorm(var); qqline(var)                         
boxplot(var)
shapiro.test(var)

## One-sample t test to check whether this sample could be drawn from
## a population with mu=12 (with a 95% confidence interval).
t.test(var,mu=12,conf.level=0.95) 
## with different mu
t.test(var,mu=14,conf.level=0.95) 




##### Paired t-test
## H0: mu_1 = mu_2 <=> mu_1-mu_2=0 <=> mu_diff=0   (= one-sample t test for differences)
## HA: mu_diff != 0
## Assumption: normally distributed population for the differences,
## paired (related) data: for the same cases having 2 measurements for the same parameter

## data
var_before <- rnorm(50,75,15)
var_after <- var_before + rnorm(50,1,2)

## checking normality
summary(var_after-var_before); sd(var_after-var_before); length(var_after-var_before)
hist(var_after-var_before)
plot(density(var_after-var_before), main="Density curve")
qqnorm(var_after-var_before); qqline(var_after-var_before)
## paired t test
t.test(var_before,var_after,paired=TRUE)
qt(0.025,49)   ## critical value at alpha=0.05, df=49 -> t_alpha = -2.01






##### Independent samples t-test
View(ToothGrowth)
View(sleep)

## H0: mu_1 = mu_2
## HA: mu_1 != mu_2
## Assumption: normally distributed populations in both groups (independent sample groups)
## and equal variances in the 2 populations, the 2 populations have to be independent

var <- ToothGrowth$len          ## continuous variable
group <- ToothGrowth$supp       ## grouping variable, values: VC, OJ
g1 <- c("VC")
g2 <- c("OJ")

## for sleep data:
var <- sleep$extra
group <- sleep$group
g1 <- 1
g2 <- 2

#### checking normality
summary(var[group==g1]); sd(var[group==g1]); length(var[group==g1])
summary(var[group==g2]); sd(var[group==g2]); length(var[group==g2])
boxplot(var~group,col="red",main="Boxplot for var by groups") 
qqnorm(var[group==g1]);qqline(var[group==g1]) 
qqnorm(var[group==g2]);qqline(var[group==g2])
hist(var[group==g1])
hist(var[group==g2])
#### checking equal variances
var.test(var[group==g1],var[group==g2])
#### Independent samples t-test for equal variances
t.test(var~group, var.equal=T)
#### Independent samples t-test for unequal variances
t.test(var~group)










##### Analysis of Variance (ANOVA)
## H0: mu_1 = mu_2 = mu_3
## at least one of the population means is different from the others
## Assumptions: normal distribution in all groups in the populations
## equal variances
## independent groups (samples)
data <- iris
View(data)
View(InsectSprays)
View(Orange)

var <- data$Sepal.Length
group <- data$Species
g1 <- "setosa"
g2 <- "versicolor"
g3 <- "virginica"

## Descriptive statistics
summary(var[group==g1]);sd(var[group==g1]);length(var[group==g1])
summary(var[group==g2]);sd(var[group==g2]);length(var[group==g2])
summary(var[group==g3]);sd(var[group==g3]);length(var[group==g3])
boxplot(var~group)
hist(var[group==g1]);hist(var[group==g2]);hist(var[group==g3])
## test equality of variances for more than 2 groups
bartlett.test(var~group)
## ANOVA
fit<-aov(var~group)         ## Type 1, unbalanced
summary(fit)                ## usual ANOVA table

## when ANOVA has a significant p-value, we can check pairwise comparisons
pairwise.t.test(var,group,p.adj="none")     ## LSD
pairwise.t.test(var,group,p.adj="bon")      ## Bonferroni
TukeyHSD(fit)

aovmod1<-lm(var~group)     ## Type 3
anova(aovmod1)
summary(aovmod1)
TukeyHSD(aovmod1)












##### Chi-square tests, Fisher's Exact test, Odds Ratio, Cohen's Kappa
#View(HairEyeColor)

##### Chi-square tests, Fisher's exact test
## for checking whether 2 discrete variables are independent or not
## just based on their frequencies (contingency table)
## H0: two discrete variables are independent
## HA: two discrete variables are not independent
## Assumption of Chi-square test: at least 80% of expected values frequencies have to be 
## greater than 5 in the contingency table 
x <- matrix(c(40, 22, 18 ,26 ,5, 11), nrow = 2)       ## a 2x3 matrix
x
chisq.test(x)$expected 
chisq.test(x,correct=F)            ## p=0.01222 -> significant at 5% level, because p < alpha=0.05 (we reject H0)

## for data from dataset 
dat <- read.table(text = "
                  group1 group2 score    
                  1       1       2
                  1       1       2
                  1       2       1
                  1       2       4
                  2       2       5
                  2       1       6
                  2       1       1
                  2       2       1
                  2       1       3
                  1       1       1
                  2       1       2
                  1       2       3
                  1       2       3
                  1       2       4", 
                  header=TRUE)     
v1 <- dat$group1
v2 <- dat$group2

o <- table(v1,v2)                ## observed values
o
chisq.test(o,correct=F)                     ## Pearson's Chi-square test: p=0.2801
chisq.test(o)                               ## Pearson's Chi-square test with Yates' continuity correction: p=0.59 -> it resuts a more strict p-value
e <- chisq.test(o)$expected                 ## expected values
e
chi_square <- (o[1,1]-e[1,1])^2/e[1,1] +    ## hand calculation for Pearson's Chi-square teststatistic
  (o[1,2]-e[1,2])^2/e[1,2] +
  (o[2,1]-e[2,1])^2/e[2,1] +
  (o[2,2]-e[2,2])^2/e[2,2]
chi_square                                  ## result of hand calculation
chisq.test(o,correct=F)$statistic           ## result of R using chisq.test command (the same)
chisq.test(o,correct=F)$method              ## which method was used
chisq.test(o)$method


## with Monte Carlo simulation
x  =  matrix(c(2, 12, 8 ,6 ,5, 11), nrow = 2)
chisq.test(x)$expected
chisq.test(x)                                 ## p=0.05468
chisq.test(x,simulate.p.value=TRUE)           ## p=0.06297

chisq.test(v1,v2,simulate.p.value=T)        ## p=0.5987



## Fisher's Exact test
## with no assumptions, but many software does not calculate it, because it is a lot of calculation
fisher.test(x)                                ## p=0.05959

fisher.test(v1,v2)                          ## p=0.5921
fisher.test(o)                              ## the same



## We can also check the odds ratios for 2×2 contingency tables:
## OR=ad/cb = (a/b) / (c/d): 
## ratio of diseased and not diseased when there is a risk factor / ratio of diseased and not diseased when there is NOT a risk factor
## if there is no effect of the risk factor (the exact diagnostic test result), OR=1
## H0: OR=1
## HA: OR!=1
#fisher.test(interesting,necessary)         ## OR=0.008369517, 1/OR=119.4812 -> if it is worth to test, means anything
#table(interesting,necessary)

#fisher.test(gender,interesting)            ## OR=2.280291
#table(gender,interesting)

## hand calculation
## G07 practical sheet, Example 2: Drug Type A and B compared to side effect of drug (yes, no)
a <- 10
b <- 20
c <- 5
d <- 25
m <- matrix(c(a,c,b,d),nrow=2)
m
or <- (a/b)/(c/d)
or                                   ## OR = 2.5
## is OR=2.5 significantly different from 1? (H0: OR=1)
se <- sqrt(1/a + 1/b + 1/c + 1/d)    ## standard error based on the appropriate formula
se                                   ## 0.6244998
low_ci <- exp(1)^(log(or)-1.96*se)   ## lower level of 95% CI
low_ci                               ## 0.7351146
up_ci <- exp(1)^(log(or)+1.96*se)    ## upper level of 95% CI
up_ci                                ## 8.502076
## Decision: as 1 is included in [0.735; 8.502] 95%CI, then we accept H0
## Our odds ratio could be 1 (it is by chance that it is 2.5 in our sample)
## so the proportion of side effect could be equal in Drug A and Drug B.













##### Linear correlation and regression
## checking linear connection between 2 continuous variables
## Symmetric relationship, not a cause-effect relation
## H0: b theoretical=0  or rho=0
## HA: b theoretical!=0 or rho!=0
## Assumptions: residuals are normally distributed

data <- Loblolly
View(Loblolly)
View(EuStockMarkets)
View(faithful)
View(women)

dep = data$age           ## dependent continuous variable
indep = data$height      ## independent continuous variable

plot(indep,dep,ylab='Age',xlab='Height')        ## scatter plot representing the dependencies between 2 continuous variables
## first variable: horisontal axis, second v.: vertical axis
abline(lm(dep~indep), col="red", lwd=3)         ## closest line to the points: regression line: y=a+b×x+h
cor(dep,indep)                                  ## coefficient of linear correlation (Pearson correlation): r=0.9899
## it shows a positive (increasing) strong linear correlation between heights and ages
## r could be between -1 and 1.

hist(lm(dep~indep)$residuals)                   ## assumption: residuals are normally distributed
qqnorm(lm(dep~indep)$residuals);qqline(lm(dep~indep)$residuals)

summary(lm(dep~indep))                           ## linear regression: p<2.2×10^-16 < alpha=0.05 -> significant
## There is a significant positive strong linear correlation between ages and body heights at 5% significance level.

cor.test(dep,indep)                              ## the significance of Pearson correlation
lm(dep~indep)                                    ## equation of regression line