# Prax 2 # Open this text-fail as script in R (R: Fail -> Open Script ...) # Using the R manuals if necessary # for example # http://ph.emu.ee/~ktanel/DK_0007/DK_prax21.pdf # run in R the following commands (Ctrl+R to run active row or selection) # and try to understand what they are doing, add own comments if necessary # (everything after sign # in the row is interpreted as comment). # ---------------------------------------------------------------------------------------- # Save from the course web-page the dataset studentsR_eng.xls (http://ph.emu.ee/~ktanel/DK_0007/studentsR_eng.xls) # and open it in MS Excel. # The dataset contains random sample (n = 100) of the first year veterinary and animal science students from 2003-2005. # To import the dataset into R, save it in Excel as csv-fail. # Try to import the saved fail into R showing the exact adress and symbols used in separating columns and decimals # (if you are not sure, which symbols the Excel was using, open the csv-fail in Notepad and look up) # The importing command in R is the following: students=read.csv("C:/Documents and Settings/Tanel/My Documents/DK_0007/studentsR_eng.csv",header=TRUE,sep=";",dec=",") # In case of problems, you can import the dataset from internet running the command: students=read.csv("http://ph.emu.ee/~ktanel/DK_0007/studentsR_eng.csv",header=TRUE,sep=";",dec=",") # The following command shows the list of trait names names(students) # The 'attach' command identifies the database 'student' as default database # Why is this needed? attach(students) # ------------------------------------------------------------------------------------- # Run the following commands and try to understand, what they are doing mean(height) mean(height[sex=="N"]) by(height,sex,mean) tapply(height,sex,mean) summary(height) # more functions: length, min, max, range, median, sd, var, quantile(height, 0.13) mean(weight) # You shoult get a result 'NA'. Problem is in this, that R is very pedantic concerning missing values; # you must say to R, what to do with missing values - for example remove from analyses: mean(weight,na.rm=T) # "not available remove" = true mean(weight[sex=="N" & bmi>mean(bmi,na.rm=T)], na.rm=T) mean(na.omit(weight)) # more multiuse possibility to remove missing values mean(na.omit(weight[sex=="N" & math>3])) sum(!is.na(weight)) length(na.omit(weight)) # ------------------------------------------------------------------------------------- # Try by itself (these are the excercises from the 1st individaul work): # ------------- # Calculate the mean and the median of the trait 'headcirc' depending on the math. grade. # Is porridge eating and non-eating students’ weights’ variability different? # ------------------------------------------------------------------------------------- table(happy) barplot(table(happy), main="The frequency of happiness",ylab="No of students",xlab="Happy?",col=heat.colors(3)) barplot(table(porridge), col=c("yellow","orange","red"), ylim=c(0,50)) # By default R overwrites the old diagram with the new, but # you can order the R to keep also the previous diagrams choosing from the History-menu command . # After this you can move between constructed diagrams using the keys & . # ------------------------------------------------------------------------------------- table(subj_gr) # Problem - R interprets the empty cell as the real value, # to get more correct results, you should say to R that empty cell means missing value. # One possibility to do this is to add into dataset new correct variable 'subj_gr2' detach(students) # to add the new variable, the dataset must be deactivated at first ... students$subj_gr2=ifelse(students$subj_gr!="O",ifelse(students$subj_gr!="S",ifelse(students$subj_gr!="A",NA,students$subj_gr),students$subj_gr),students$subj_gr) table(students$subj_gr2) attach(students) table(factor(subj_gr2,labels=c("Art","Other","Science"))) subj_grf=factor(subj_gr2,labels=c("Art","Other","Science")) round(100*prop.table(table(subj_grf)),2) pie(table(subj_grf)) # What is the difference between following tables? 100*prop.table(table(sex,subj_grf),1) round(100*prop.table(table(sex,subj_grf),2),1) addmargins(table(sex,subj_grf)) # ------------------------------------------------------------------------------------- # Try by itself: # ------------- # How are the students’ math grades distributed – construct the frequency table and the bar plot. # Are the relative frequencies of math grades different in different specialities # (LKI=animal science, LAT=veterinary science)? # Construct the pie plot of sex and write close to the sectors 'Men' and 'Women'. # ------------------------------------------------------------------------------------- hist(weight,col="red",las=1) hist(weight,col="gold",las=1,ylab="f(x)",xlab="x",main="Density of weight",xlim=c(30,110),probability=TRUE) lines(density(na.omit(weight)),lwd=2) # The following 4 rows should be run as one block (select these rows and press 'Ctrl'+'R') par(mfrow=c(2,1)) # devides the diagram window into 2 rows and 1 column hist(weight[sex=="N"],xlim=c(40,100), main="Women", xlab="kaal(kg)", ylab="Frequency") hist(weight[sex=="M"],xlim=c(40,100), main="Men", xlab="kaal(kg)", ylab="Frequency") par(mfrow=c(1,1)) # You can construct you own intervals: weightinterval=cut(weight,c(40,50,60,70,80,90,100)) # insted of exact limits you can also simply say how many intervals you want: ...cut(weight,6) table(weightinterval) 100*prop.table(table(weightinterval)) table(weightinterval,sex) Men=table(weightinterval[sex=="M"]) Women=table(weightinterval[sex=="N"]) barplot(rbind(Women,Men),beside=TRUE,legend.text=TRUE,col=c("Green2","Green4")) boxplot(height, ylim=c(150,195)) boxplot(height~factor(sex), col="skyblue1", names=c("Men","Women"), ylim=c(150,195), main="Height box-plot") # More complicated diagrams (run next 8 rows together): par(mfrow=c(2,1)) hist(height[sex=="M"], col="lightblue", main="Men", ylab="Percentage", xlab="Height", las=1, probability=TRUE, ylim=c(0,0.065), xlim=c(150,205)) lines(density(na.omit(height[sex=="M"]))) hist(height[sex=="N"], col="red", main="Women", ylab="Percentage", xlab="Height", las=1, probability=TRUE, ylim=c(0,0.065), xlim=c(150,205)) lines(density(na.omit(height[sex=="N"])), lty=2) par(mfrow=c(1,1)) # ------------------------------------------------------------------------------------- # Try by itself: # ------------- # Constract the weights' histograms and densities for students eating and not eating the porridge. # Make a frequency table of students' height devided into 6 intervals. # Make a box-plots on head circuits for students with different math. grade. # ------------------------------------------------------------------------------------- # Different tests (try to make a conclusions from each test) # Is the average height different from 170? t.test(height,mu=170,conf.level=0.99) t.test(height[sex=="M"],mu=180,conf.level=0.95) # Are the head circuits different depending on the math. grade being 3 or 4? t.test(headcirc[mat==3],headcirc[mat==4]) # test assumes normal distribution in both groups # the normality is not assumed wilcox.test(headcirc[mat==3],headcirc[mat==4]) # Wilcoxon aka Mann-Whitney test ks.test(headcirc[mat==3],headcirc[mat==4]) # Kolmogorov-Smirnov test # The warning means, that in dataset there are some equal head circuit' values, # but KS test assumes by default that all values are different. # Is the variability of head circuits depending on the students' sex? bartlett.test(headcirc, sex) # Bartlett test - advanced version of traditional F-test used in comparison of variances # (assumes the normal distribution in compared groups) # If there is no normal distribution, the Fligner-Killeen test can be used to compare variances: fligner.test(headcirc, sex) # Is the head circuit normally distributed? shapiro.test(headcirc) # Shapiro-Wilk test # As most of the tests used in controlling the normality, also the SW test is usually to stringent. # In practical data analysis usually the graphical control of normality is sufficient. For example: hist(headcirc) # As in comparing the groups, the normality is assumed in all groups, then ... (run these 4 lines together): par(mfrow=c(2,2)) by(headcirc, sex, hist) by(headcirc, sex, qqnorm) par(mfrow=c(1,1)) # If there are no missing values, also the Kolmogorov-Smirnov test can be used in normality testing # (but it is even more stringent than Shapiro-Wilk test): ks.test(height, pnorm, mean=mean(height), sd=sd(height)) # ------------------------------------------------------------------------------------- # Try by itself: # ------------- # Find the 95% CI of women's average weight. Is the average weight different from 60 kg (test the hypothesis)? # Are the average body mass indexes of men and women statistically significantly different? # Is the body mass index distribution different from the normal distribution? # ------------------------------------------------------------------------------------- # And finally the analysed dataset should be deactivated: detach(students) # -------------------------------------------------------------------------------------