## R practicals ## Course on SZTE TTIK ## Files: http://www.inf.u-szeged.hu/~csicsman/oktatas/adatbanyaszat/adatbanyaszat.html ## http://www.math.bme.hu/~csicsman/oktatas/statprog/statprog.html ## Anna Laszlo ## 2017.05.08, 05.15 ## References, further readings: ## - The R Project for Statistical Computing: http://www.r-project.org/ ## - RStudio: http://www.rstudio.com/ ## - Help, support, docs from the software and/or internet ## - Coursera - Computing for Data Analysis course at Johns Hopkins by Roger Peng (4 weeks long, started on 23rd of September, 2013) ## - Bob Muenchen: R for SAS and SPSS Users (http://r4stats.com) ## - Quick-R: http://www.statmethods.net/interface/guis.html - great descriptions about methods and examples ## - Peter Dalgaard: Introductory Statistics with R (in Statistics and computing series), 2nd ed. Springer, 2008. ## - Alain F. Zuur, Elena N. Ieno, Erik H.W.G. Meesters: A Beginner's Guide to R (in Use R! series), Springer, 2009. ## - Randall Schumacker, Sara Tomek: Understanding Statistics Using R, Springer, 2013. ## - Reiczigel Jenő, Harnos Andrea, Solymosi Norbert: Biostatisztika nem statisztikusoknak (http://biostatkonyv.hu/) - Biostatistics for not statisticians, Pars Kft. Nagykovácsi, 2014. ############### Practical 1 ############### ##### About R, RStudio ## Originated from S language, which was developed by John Chambers & Bell Labs in 1976 ## 1988 - implementation in C (Version 3) ## Whitebook: Statistical models in S ## Greenbook: Programming with data (Version 4) ## R: Implementation of S language ## 1991 New Zealand: Ross Ihaka & Robert Gentleman -> R ## 1993 First announcement ## 1995 GNU (General Public Licence) -> free ## 1996 ICGS paper - R is documented ## 1997 R Core group ## 2000 R version 1 ## R is a command line driven program. ## The user enters commands at the prompt ( > by default ) and each command is executed one at a time. ## If we run scripts or .r files, we can execute more lines - for those we can use code editors (like RStudio). ## Devided to modular Packages: ~4000 (not controlled by R Core! Developed by users!) ## (For example Bioconductor for genomic analysis is a whole project - one of the biggest ones!) ## in RStudio to install a package: run install.packages() command, or: ## right bottom window -> Packages -> Install Packages -> ## write the name of the required package -> Install (it finds it via CRAN from the web) -> ## then to use an installed package select the library in Package (or run the library() command) ## To install packages we have to be system administrators on the computer ## Drawback!! Objects are stored in physical memory, which can be a limitation! ## RStudio UI structure: ## 3 windows by default (first use): ## - Console: bottom left ## to clear 50 rows from the console: cat(rep("\n",50)). To clear the whole: Ctrl+l, or to do it with a function, find: http://tolstoy.newcastle.edu.au/R/help/06/02/21634.html ## - Workspace/History: upper right ## - Files/Plots/Packages/Help: bottom left ## File -> New -> R Script: opens a new editor window above the Console -> save: .r file ## Help - using exact command names help(summary) ?mean ?means ## No documentation for 'means' in specified packages and libraries ??means ## wider search demo() ## it lists the available demos separated to packages and functions in them demo(colors) showCols2() demo(graphics) RSiteSearch("means") ## search via internet ## Freely available: ## R: http://www.r-project.org/ -> http://cran.rapporter.net/ ## RStudio: http://www.rstudio.com/ide/download/desktop ## In RStudio: run a command: Ctrl+Enter ## There are other GUIs for R ## i.e. TinnR is good for long scripts for debugging - http://www.sciviews.org/Tinn-R/ ## or R Commander ## Capital letters matter!! ## unlike SAS or SPSS (which handle each with different systems that follow different rules), ## R mixes (integrates) the 5 main categories of functions: ## - data input and management ## - statistical analysis and graphical procedures ## - output management (SAS: ODS, SPSS: OMS) ## - macro language: to repeat written commands ## - matrix language: to write new algorithms (SAS: IML, SPSS: Matrix) ##### Import data ## Set working directory: Session -> Set Working Directory -> Choose directory -> select own folder setwd("C:/Anna/SAS/2016-17-II/gyak/12-13_R/data") ## read data ## http://www.ats.ucla.edu/stat/spss/examples/da/daspss_dl.htm ## Variable A is three levels of treatment, 1 is control, 2 is positive, and 3 is negative. ## B is a second independent variable and has three levels, 1 = low F & low E, ## 2 = High F & Low E, 3 = High F & High E. ## The dependent variable is the memory score. a <- read.table("Keppel_ch12_memory.txt", header=TRUE) ## from txt file, where first row contains variable names View(a) ## open whole dataset - in RStudio it opens in a nice data browser head(a, n = 6) ## print the first 6 rows of dataset names(a) ## variable names of a dataset names(a)[1] <- "subject" ## rename variable names names(a)[2] <- "treatment" names(a)[3] <- "factor_FE" names(a) attach(a) mean(memory) ## read data from CSV file format ## http://www.ats.ucla.edu/stat/spss/examples/da/daspss_dl.htm ## Variable A has three levels, representing three different instruction techniques, ## and the outcome is the students comprehension score. b <- read.csv2("Keppel_ch13_comprehension.csv") ## ; separated values ## , separated values - read.csv() attach(b) ## attach variables from dataset to memory - if not, to use variable "age", b$score should be run str(b) ## structure of a dataset ## from SPSS ## http://www.ats.ucla.edu/stat/spss/examples/alr2/alr2spssch1.html ## Age and coronary heart disease (CHD) status of 100 subjects. library(foreign, pos=4) ## Package to Read Data Stored by Minitab, S, SAS, SPSS, Stata, Systat, dBase, ... Dataset <- read.spss("Hosmer_ch1_CHD.sav", use.value.labels=TRUE, max.value.labels=Inf, to.data.frame=TRUE) colnames(Dataset) <- tolower(colnames(Dataset)) ## to use only lowercase letters for variable names attach(Dataset) ## Data structure mainly: ## Rows: subjects, observations, cases (customers, patients, experimental animals, plants, ...) ## Columns: variables - properties, measurements, features, characteristics, attributes (gender, age, body mass, height, blood pressure, salaries, ...) ## Longitudinal data ## Wide or long file format ## To change data structure from long to wide: ## http://stackoverflow.com/questions/14296918/complex-long-to-wide-data-transformation-with-time-varying-variable ## http://www.statmethods.net/management/sorting.html dat <- read.table(text = " id cohort s 1 1 2 1 1 2 1 1 1 1 1 4 2 3 1 2 3 1 2 3 3 3 2 1 3 2 2 3 2 3 3 2 3 3 2 4", header=TRUE) datframe <- data.frame(dat, period=sequence(rle(dat$id)$lengths) ) ## period will be a new variable containing timepoints; rle(): Run Length Encoding widedat <- reshape(datframe, v.names="s", sep="", idvar=c("id", "cohort"), timevar="period", direction="wide") widedat[is.na(widedat)] = 0 ## missing data will be replaced with 0 widedat ## printing - the same as: print(widedat) ## reshape our wide data format into long: longdat <- reshape(widedat, direction="long", timevar="period") ## we got back "dat", just the rows are in a different order longdat[order(longdat$id),] ## reordering rows ##### Using R the first time ## calculator: it can be used as a normal calculator with well known mathematic properties 5+12 7-9 5*7 8/2 2^4 1+2^3 (1+2)^3 7+9 - (9+7) ## commutative (10+20)+30 - (10+(20+30)) ## associative (1+2)*3 - (1*3+2*3) ## distributive ##### Data types ## In R everything is an object ## 5 basic classes: ## character, numeric (real), integer (with L suffix), complex, logical (True, False) Inf ## infinite NaN ## undefined value or missing ## sequence x <- c(1,4,7,2,6,2,3,2,4,5,6,4,6) ## assignment operator: like "="; c(): concatenate c <- c("apple","pear","plum"); c ## with c() we can also concatenate strings ## using ; more commands can be executed in the same line y <- 3:10 y ## autoprint, the same as print(y) print(y) ## [1] means the first element of a vector class(x) ## data type z <-as.numeric(x) ## conversion y <-as.logical(x) ## it rewrites object y y f <- as.factor(x) f x2 <- as.numeric(f) x2 f2 <- c("aa","bb") ## a string factor can NOT be converted to numeric! x2 <- as.numeric(f2) ## vector x <- vector("numeric", length=8) ## previous x vector is rewritten: [1] 0 0 0 0 0 0 0 0 0 0 ## by default it fills the vector with 0s x <- rbinom(n=10,size=20,prob=0.5) ## generating a random vector from a binomial distribution with 10 elements, equal probablities between 0 and 20 x ## indexing x[1] ## simple numeric selector x[2:3] ## sequence selector x[c(2,6)] ## vector selector x[x>10] ## logical selector ## matrix: vector with 2 dimensions m <- matrix(1:6, nrow=2, ncol=3) ## constructed columnwise m2 <- matrix(1:6,nrow=3,ncol=2) m %*% m2 ## matrix multiplication list(1,"a",TRUE,1+4i,4.78,8L,"Hello World!") ## it creates 7 vectors: [[4]] - 4th vector ## factor: categorical (discrete) variable: nomenclature ans <- factor(c("yes","yes","no","yes","no","no","no")) ans table(ans) ans12 <- unclass(ans) ans12 ## set the baseline: the first value of levels is the baseline ans <- factor(c("yes","yes","no","yes","no","no","no"),levels=c("yes","no")) ans12 <- unclass(ans) ans12 ## data frames: d <- data.frame(x=1:4,y=5:8,z=2:5) ## datasets are constructed in data frames x ## x is already defined d$x ## to use x variable in d data frame, we have to name it exactly ##### Data management ## binding columns and rows cbind(x,y) rbind(x,y) rm(x,y) ## delete objects x <- rbinom(n=10,size=20,prob=0.5) x x <- sort(x) ## sort values ascending in a vector ##### Adding new variables in the dataset ## creating a new variable based on one in the dataset, and add it to the same dataset a <- transform(a, mem_gr=ifelse(memory>8,1,0)) ## mem_gr will be our new variable ##### Selecting rows or columns from a dataset - Filtration aa <- a[c(2:5)] ## keep the 2nd-5th variables in dataset "a" aa <- a[-1] ## drop (exclude) first variable aa <- a[treatment==1 | treatment==2,c(2:5)] ## select rows, where treatment is 1 or 2 ##### Treating missing values nrow(aa) ## check the number of rows in a dataset ah=na.omit(aa) ## a is a data frame - delete rows with missing values in any column attach(ah) x <- c(1,2,NA,4,NA,5) bad <- is.na(x) ## logical vector x[!bad] x <- 1:4 y <- c(2:4,NA) good <- complete.cases(x,y) x[good] y[good] ##### Data generation ## http://en.wikibooks.org/wiki/R_Programming/Working_with_data_frames N <- 100 u <- rnorm(N) ## generate a variable from a random normal distribution x1 <- rnorm(N) x2 <- rnorm(N) y <- 1 + x1 + x2 + u mydat <- data.frame(y,x1,x2) xx <- rnorm(1000,mean=75,sd=10) mean(xx) hist(xx) plot(density(xx)) ## generating i.e. body mass values with mean=75 and standard deviation=15 (n=250) ## symmetrical distribution bm <- rnorm(250,75,15) ## checking distribution visually hist(bm) ## histogram plot(density(bm)) ## density curve qqnorm(bm);qqline(bm) ## QQ-plot ## body mass differences between 2 time points ## distribution is skewed to the right bmdiff <- rlnorm(250,0,0.3) hist(bmdiff) plot(density(bmdiff)) qqnorm(bmdiff);qqline(bmdiff) ## blood sugar levels measured at 2 timepoint bs1 <- rlnorm(250,2,0.3) hist(bs1) plot(density(bs1)) bs2 <- rlnorm(250,2,0.5) hist(bs2) plot(density(bs2)) ## put together generated variables into a dataset health_gen_data <- data.frame(bm, bmdiff, bs1, bs2) ## create a CSV file from this dataset write.csv(health_gen_data, file = "health_gen_data.csv") ##### Data merge id <- 1:250 df1 <- data.frame(id,bm, bmdiff) df2 <- data.frame(id,bs1, bs2) df22 <- df2[order(df2$bs1),] ## merge two data frames by variable "id" merge_df <- merge(df1,df22,by="id") ## Adding rows (set of 2 datasets in SPSS) hgd <- cbind(health_gen_data, id) set_df <- rbind(hgd[c(5,1:2)],df1) ##### Descriptive statistics ## Distribution of Discrete variables gender <- factor(c("M","F","M","M","F","M","F","M","M","F"),levels=c("M","F")) eye_color <- factor(c(1,1,3,2,3,1,3,2,1,3),levels=c(1,2,3)) score <- factor(c(1,2,4,3,4,5,5,3,5,4),levels=c(1,2,3,4,5)) data <- data.frame(gender, eye_color, score) table(data$gender) ## frequency table table(data$eye_color) table(data$gender,data$eye_color) ## contingency table table(data$score) cumsum(table(data$score)) ## cumulative frequencies prop.table(table(data$score)) ## relative frequencies cumsum(prop.table(table(data$score))) ## cumulative relative frequencies barplot(table(score)) ## frequency bar chart barplot(table(score),main="Gyakorisági oszlopdiagram a jegyek megoszlására", xlab="Érdemjegyek", ylab="Gyakoriság (fő)", col="blue") barplot(cumsum(table(score))) ## cumulative frequency bar chart barplot(prop.table(table(score))) ## relative frequency bar chart barplot(cumsum(prop.table(table(score)))) ## cumulative relative frequency bar chart pie(table(score), col=rainbow(5)) ## pie chart table(b$group) barplot(table(b$group)) pie(table(b$group)) ## Distribution of Continuous variables x <- c(1,6,3,8,2,5,9,2,3,1,NA,5) summary(x); "szórás:"; sd(x, na.rm=T); "esetszám hiányzó értékekkel együtt:"; length(x) ## summary statistics treating missing values in calculating the standard deviation (na.rm=T) range_x <- max(x,na.rm=T)-min(x,na.rm=T) ## range iqr_x <- quantile(x,0.75,na.rm=T)-quantile(x,0.25,na.rm=T) ## interquartile range p05 <- quantile(x,0.05,na.rm=T) ## 5% percentile names(sort(-table(x)))[1] ## the first mode hist(x) ## histogram hist(x, col="lightblue", breaks=c(0,4,8,12)) ## histogram with different break points boxplot(x) ## box-whisker plot (box diagram) plot(density(x, na.rm=T)) ## density curve plot(ecdf(x)) ## distribution function qqnorm(x);qqline(x) ## Normal Quantile-Quantile plot ##### Summary tables ## Aggregating data gr <- rbinom(250,2,0.5) ## generating a grouping variable hgd <- cbind(hgd,gr) ## then adding it to dataset "hgd" ## calculate means of variables in dataset "hgd" into a different dataset (called aggdata) aggdata <- aggregate(hgd, by=list(gr), FUN=mean, na.rm=TRUE) aggdata summ <- sapply(hgd,summary) sd <- apply(hgd, 2 ,sd) ## in apply 2 means 2nd dimension, which is columns N <- apply(hgd, 2, length) rbind(summ,sd,N) ## getting percentiles library(Hmisc) describe(hgd) ## or another type of descriptive stat. ## rows: statistics, columns: variables install.packages("pastecs") library(pastecs) stat.desc(hgd) ## or the transponate of previous format install.packages("psych") library(psych) describe(hgd) mean <- tapply(hgd$bm, hgd$gr, mean) sd <- tapply(hgd$bm, hgd$gr, sd) N <- tapply(hgd$bm, hgd$gr, length) rbind(mean,sd,N) ##### Graphics with more details and parameters ## Scatter plot: to check the association between 2 continuous variables x <- b$score[b$group==1] y <- b$score[b$group==2] y <- rnorm(12,4,1) x <- rnorm(12,6,3) plot(x,y) abline(lm(y~x), col="red", lwd=3) ## linear regression cor(x,y) ## correlation - Pearson correlation coefficient ##### creating a barplot with more attributes, then saving it to PDF freq <- matrix(c(20,11,29,39,31,45), nrow=3, ncol=2) percent <- matrix(c(33,18,48,34,27,39), nrow=3, ncol=2) pdf(file = "barplot_LA_2017-05-08.pdf") ## open a PDF file where we save the followings until dev.off() par(mar=c(8,5,4,2)) ## set margins (bottom, left, top, right) a <- barplot( percent, ## percentage values from a vector beside=T, ## vertical bar chart axes=T, ## vertical axis is visible xlab="", ## label of x axis ylab="", ## label of y axis names.arg=c("Cases","Controls"), ## names of grouping variable groups on x axis border=c("black"), ## color of bar borders space=c(0.2, 1.5), ## spaces between bars (in case of more factors, more space values can be determined) width=0.8, xlim=c(1,8), ## scale of x axis col=c("darkolivegreen","darkolivegreen3","darkolivegreen1"), ## colors for bars (with less colors than as many bars we have, it repeats) main="Oral contraceptive use in cases and controls", ## title of tha bar chart ylim=c(0,60) ## scale of y axis ) #legend of the graph: cex-letter size, bty-without border, fill-color legend("topleft", c("Nonusers","Previous users","Current users"), cex=0.9, bty='n', fill=c("darkolivegreen","darkolivegreen3","darkolivegreen1")) text(x = a, y = percent, label = paste(percent,"%"), pos = 3, cex = 0.8, col = "black", font=2) text(x = a, y = percent, label = freq, pos = 1, cex = 0.6, col = "black") axis(1, at=a, labels=F, tck=0) ## 1: horisontal axis, tck: length of major tick mark ## footnote mtext(side=1,"Figure 2 Oral contraceptive use for women in percentages (above bars) within case and control groups. Frequencies are also shown inside the bars.", outer=T, cex=0.8, line=-2.5, font=3, adj=0.5 ## cex: size of letters, line, adj: position, font: font type - 3: Italic ) ## axes labels mtext(side=2,"Percentages", cex=0.9, line=2.8, adj=0.6) mtext(side=1,"Oral contraceptive use for women", cex=0.9, line=2.5) dev.off() ## finish writing the PDF file ##### Mean and standard deviation plot ## by using a function, which creates error bars error.bar <- function(x, y, upper, lower=upper, length=0.1,...){ if(length(x) != length(y) | length(y) !=length(lower) | length(lower) != length(upper)) stop("vectors must be same length") arrows(x,y+upper, x, y-lower, angle=90, code=3, length=length, ...) } hgd.means <- apply(health_gen_data[3:4],2,mean) hgd.sd <- apply(health_gen_data[3:4],2,sd) hgd.n <- length(health_gen_data$bs1) bar <- barplot(hgd.means, col="lightgreen", main="Mean and standard error plot", ylim=c(0,12), axis.lty=1, xlab="Time", ylab="Mean", xlim=c(0,4)) error.bar(bar,hgd.means, hgd.sd/sqrt(hgd.n)) mean_group <- tapply(b$score, b$group, mean) sd_group <- tapply(b$score, b$group, sd) n_group <- tapply(b$score, b$group, length) bar <- barplot(mean_group, col="orange", main="Mean and standard deviation plot", ylim=c(0,20), axis.lty=1, xlab="Group", ylab="Mean", xlim=c(0,4)) error.bar(bar,mean_group, 1.96*sd_group/sqrt(n_group)) ##### Bland-Altman plot BAplot <- function(x,y,yAxisLim=c(-1,1),xlab="Average", ylab="Difference") { d <- ((x + y)/2) diff <- x - y plot(diff ~ d,pch=16,ylim=yAxisLim,xlab=xlab,ylab=ylab) abline(h=mean(diff)-c(-2,0,2)*sd(diff),lty=2) } BAplot(health_gen_data$bs2,health_gen_data$bs1,yAxisLim=c(-30,30)) ##### Save/export outputs into the Workspace ## Save a plot png("histogram.png",width = 480, height = 480, units = "px") hist(health_gen_data$bs2,col="indianred") dev.off() ## to Write a Text File write.table(a,"mydata.txt",sep="\t") ## to Write a SAS Export File and a program to read it into SAS library(foreign) write.foreign(a, "mydata.txt", "mydata.sas", package="SAS") ## to Write an SPSS Export File and a program to read it into SPSS library(foreign) write.foreign(a, "mydata.txt", "mydata.sps", package="SPSS") ## sink() could also be used to send R output to a file - check help ##### useful functions version ## gives the version of os -> to check whether we have enough memory ## if we have a dataset with 1,500 tousand rows and 120 columns ## 1,500,000 * 120 * 8 bytes -> 1,34 GB ## with 2 GB RAM it is in the border to handle this data ls() ## list of objects in the memory dir() ## list of datasets in the memory ##### using other built in graphical interface with more functions in menus ## R Commander: it generates the code as well ## the package first has to be installed: install.packages("Rcmdr") library("Rcmdr") ## in RStudio it offers us to load a list of other packages ## because it uses them - and automatically loads these contrebuted ones ############### Practical 2 ############### ##### Hypothesis testing library(datasets) library(help="datasets") ## list of datasets View(cars) str(cars) ##### One-sample t-test ## H0: mu = c (mu is the population mean, c is a constant) ## HA: mu != c ## Assumption: normally distributed population, independent cases ## variable var <- cars$speed ## checking normality summary(var); sd(var,na.rm=T); length(var) hist(var,col="brown",main="histogram of car speed") plot(density(var),main="Density estimate of car speed") qqnorm(var); qqline(var) boxplot(var) shapiro.test(var) ## One-sample t test to check whether this sample could be drawn from ## a population with mu=12 (with a 95% confidence interval). t.test(var,mu=12,conf.level=0.95) ## with different mu t.test(var,mu=14,conf.level=0.95) ##### Paired t-test ## H0: mu_1 = mu_2 <=> mu_1-mu_2=0 <=> mu_diff=0 (= one-sample t test for differences) ## HA: mu_diff != 0 ## Assumption: normally distributed population for the differences, ## paired (related) data: for the same cases having 2 measurements for the same parameter ## data var_before <- rnorm(50,75,15) var_after <- var_before + rnorm(50,1,2) ## checking normality summary(var_after-var_before); sd(var_after-var_before); length(var_after-var_before) hist(var_after-var_before) plot(density(var_after-var_before), main="Density curve") qqnorm(var_after-var_before); qqline(var_after-var_before) ## paired t test t.test(var_before,var_after,paired=TRUE) qt(0.025,49) ## critical value at alpha=0.05, df=49 -> t_alpha = -2.01 ##### Independent samples t-test View(ToothGrowth) View(sleep) ## H0: mu_1 = mu_2 ## HA: mu_1 != mu_2 ## Assumption: normally distributed populations in both groups (independent sample groups) ## and equal variances in the 2 populations, the 2 populations have to be independent var <- ToothGrowth$len ## continuous variable group <- ToothGrowth$supp ## grouping variable, values: VC, OJ g1 <- c("VC") g2 <- c("OJ") ## for sleep data: var <- sleep$extra group <- sleep$group g1 <- 1 g2 <- 2 #### checking normality summary(var[group==g1]); sd(var[group==g1]); length(var[group==g1]) summary(var[group==g2]); sd(var[group==g2]); length(var[group==g2]) boxplot(var~group,col="red",main="Boxplot for var by groups") qqnorm(var[group==g1]);qqline(var[group==g1]) qqnorm(var[group==g2]);qqline(var[group==g2]) hist(var[group==g1]) hist(var[group==g2]) #### checking equal variances var.test(var[group==g1],var[group==g2]) #### Independent samples t-test for equal variances t.test(var~group, var.equal=T) #### Independent samples t-test for unequal variances t.test(var~group) ##### Analysis of Variance (ANOVA) ## H0: mu_1 = mu_2 = mu_3 ## at least one of the population means is different from the others ## Assumptions: normal distribution in all groups in the populations ## equal variances ## independent groups (samples) data <- iris View(data) View(InsectSprays) View(Orange) var <- data$Sepal.Length group <- data$Species g1 <- "setosa" g2 <- "versicolor" g3 <- "virginica" ## Descriptive statistics summary(var[group==g1]);sd(var[group==g1]);length(var[group==g1]) summary(var[group==g2]);sd(var[group==g2]);length(var[group==g2]) summary(var[group==g3]);sd(var[group==g3]);length(var[group==g3]) boxplot(var~group) hist(var[group==g1]);hist(var[group==g2]);hist(var[group==g3]) ## test equality of variances for more than 2 groups bartlett.test(var~group) ## ANOVA fit<-aov(var~group) ## Type 1, unbalanced summary(fit) ## usual ANOVA table ## when ANOVA has a significant p-value, we can check pairwise comparisons pairwise.t.test(var,group,p.adj="none") ## LSD pairwise.t.test(var,group,p.adj="bon") ## Bonferroni TukeyHSD(fit) aovmod1<-lm(var~group) ## Type 3 anova(aovmod1) summary(aovmod1) TukeyHSD(aovmod1) ##### Chi-square tests, Fisher's Exact test, Odds Ratio, Cohen's Kappa #View(HairEyeColor) ##### Chi-square tests, Fisher's exact test ## for checking whether 2 discrete variables are independent or not ## just based on their frequencies (contingency table) ## H0: two discrete variables are independent ## HA: two discrete variables are not independent ## Assumption of Chi-square test: at least 80% of expected values frequencies have to be ## greater than 5 in the contingency table x <- matrix(c(40, 22, 18 ,26 ,5, 11), nrow = 2) ## a 2x3 matrix x chisq.test(x)$expected chisq.test(x,correct=F) ## p=0.01222 -> significant at 5% level, because p < alpha=0.05 (we reject H0) ## for data from dataset dat <- read.table(text = " group1 group2 score 1 1 2 1 1 2 1 2 1 1 2 4 2 2 5 2 1 6 2 1 1 2 2 1 2 1 3 1 1 1 2 1 2 1 2 3 1 2 3 1 2 4", header=TRUE) v1 <- dat$group1 v2 <- dat$group2 o <- table(v1,v2) ## observed values o chisq.test(o,correct=F) ## Pearson's Chi-square test: p=0.2801 chisq.test(o) ## Pearson's Chi-square test with Yates' continuity correction: p=0.59 -> it resuts a more strict p-value e <- chisq.test(o)$expected ## expected values e chi_square <- (o[1,1]-e[1,1])^2/e[1,1] + ## hand calculation for Pearson's Chi-square teststatistic (o[1,2]-e[1,2])^2/e[1,2] + (o[2,1]-e[2,1])^2/e[2,1] + (o[2,2]-e[2,2])^2/e[2,2] chi_square ## result of hand calculation chisq.test(o,correct=F)$statistic ## result of R using chisq.test command (the same) chisq.test(o,correct=F)$method ## which method was used chisq.test(o)$method ## with Monte Carlo simulation x = matrix(c(2, 12, 8 ,6 ,5, 11), nrow = 2) chisq.test(x)$expected chisq.test(x) ## p=0.05468 chisq.test(x,simulate.p.value=TRUE) ## p=0.06297 chisq.test(v1,v2,simulate.p.value=T) ## p=0.5987 ## Fisher's Exact test ## with no assumptions, but many software does not calculate it, because it is a lot of calculation fisher.test(x) ## p=0.05959 fisher.test(v1,v2) ## p=0.5921 fisher.test(o) ## the same ## We can also check the odds ratios for 2×2 contingency tables: ## OR=ad/cb = (a/b) / (c/d): ## ratio of diseased and not diseased when there is a risk factor / ratio of diseased and not diseased when there is NOT a risk factor ## if there is no effect of the risk factor (the exact diagnostic test result), OR=1 ## H0: OR=1 ## HA: OR!=1 #fisher.test(interesting,necessary) ## OR=0.008369517, 1/OR=119.4812 -> if it is worth to test, means anything #table(interesting,necessary) #fisher.test(gender,interesting) ## OR=2.280291 #table(gender,interesting) ## hand calculation ## G07 practical sheet, Example 2: Drug Type A and B compared to side effect of drug (yes, no) a <- 10 b <- 20 c <- 5 d <- 25 m <- matrix(c(a,c,b,d),nrow=2) m or <- (a/b)/(c/d) or ## OR = 2.5 ## is OR=2.5 significantly different from 1? (H0: OR=1) se <- sqrt(1/a + 1/b + 1/c + 1/d) ## standard error based on the appropriate formula se ## 0.6244998 low_ci <- exp(1)^(log(or)-1.96*se) ## lower level of 95% CI low_ci ## 0.7351146 up_ci <- exp(1)^(log(or)+1.96*se) ## upper level of 95% CI up_ci ## 8.502076 ## Decision: as 1 is included in [0.735; 8.502] 95%CI, then we accept H0 ## Our odds ratio could be 1 (it is by chance that it is 2.5 in our sample) ## so the proportion of side effect could be equal in Drug A and Drug B. ##### Linear correlation and regression ## checking linear connection between 2 continuous variables ## Symmetric relationship, not a cause-effect relation ## H0: b theoretical=0 or rho=0 ## HA: b theoretical!=0 or rho!=0 ## Assumptions: residuals are normally distributed data <- Loblolly View(Loblolly) View(EuStockMarkets) View(faithful) View(women) dep = data$age ## dependent continuous variable indep = data$height ## independent continuous variable plot(indep,dep,ylab='Age',xlab='Height') ## scatter plot representing the dependencies between 2 continuous variables ## first variable: horisontal axis, second v.: vertical axis abline(lm(dep~indep), col="red", lwd=3) ## closest line to the points: regression line: y=a+b×x+h cor(dep,indep) ## coefficient of linear correlation (Pearson correlation): r=0.9899 ## it shows a positive (increasing) strong linear correlation between heights and ages ## r could be between -1 and 1. hist(lm(dep~indep)$residuals) ## assumption: residuals are normally distributed qqnorm(lm(dep~indep)$residuals);qqline(lm(dep~indep)$residuals) summary(lm(dep~indep)) ## linear regression: p<2.2×10^-16 < alpha=0.05 -> significant ## There is a significant positive strong linear correlation between ages and body heights at 5% significance level. cor.test(dep,indep) ## the significance of Pearson correlation lm(dep~indep) ## equation of regression line