# Prax 2

# Open this text-fail as script in R (R: Fail -> Open Script ...)

# Using the R manuals if necessary

# for example
# 	http://ph.emu.ee/~ktanel/DK_0007/DK_prax21.pdf

# run in R the following commands (Ctrl+R to run active row or selection) 
# and try to understand what they are doing, add own comments if necessary
# (everything after sign # in the row is interpreted as comment).


# ----------------------------------------------------------------------------------------

# Save from the course web-page the dataset studentsR_eng.xls (http://ph.emu.ee/~ktanel/DK_0007/studentsR_eng.xls)
# and open it in MS Excel.

# The dataset contains random sample (n = 100) of the first year veterinary and animal science students from 2003-2005.

# To import the dataset into R, save it in Excel as csv-fail.


# Try to import the saved fail into R showing the exact adress and symbols used in separating columns and decimals
# (if you are not sure, which symbols the Excel was using, open the csv-fail in Notepad and look up)

# The importing command in R is the following:

students=read.csv("C:/Documents and Settings/Tanel/My Documents/DK_0007/studentsR_eng.csv",header=TRUE,sep=";",dec=",")

# In case of problems, you can import the dataset from internet running the command:

students=read.csv("http://ph.emu.ee/~ktanel/DK_0007/studentsR_eng.csv",header=TRUE,sep=";",dec=",")


# The following command shows the list of trait names

names(students)


# The 'attach' command identifies the database 'student' as default database
# Why is this needed?

attach(students)


# -------------------------------------------------------------------------------------

# Run the following commands and try to understand, what they are doing

mean(height)
mean(height[sex=="N"])

by(height,sex,mean)
tapply(height,sex,mean)

summary(height)

# more functions: length, min, max, range, median, sd, var, quantile(height, 0.13)

mean(weight)

# You shoult get a result 'NA'. Problem is in this, that R is very pedantic concerning missing values;
# you must say to R, what to do with missing values - for example remove from analyses:

mean(weight,na.rm=T)  # "not available remove" = true


mean(weight[sex=="N" & bmi>mean(bmi,na.rm=T)], na.rm=T)

mean(na.omit(weight))  # more multiuse possibility to remove missing values

mean(na.omit(weight[sex=="N" & math>3]))

sum(!is.na(weight))

length(na.omit(weight))


# -------------------------------------------------------------------------------------

# Try by itself (these are the excercises from the 1st individaul work):
# -------------
# Calculate the mean and the median of the trait 'headcirc' depending on the math. grade.
# Is porridge eating and non-eating students’ weights’ variability different?


# -------------------------------------------------------------------------------------

table(happy)

barplot(table(happy), main="The frequency of happiness",ylab="No of students",xlab="Happy?",col=heat.colors(3))

barplot(table(porridge), col=c("yellow","orange","red"), ylim=c(0,50))

# By default R overwrites the old diagram with the new, but
# you can order the R to keep also the previous diagrams choosing from the History-menu command <Recording>.
# After this you can move between constructed diagrams using the keys <PageUp> & <PageDown>.

# -------------------------------------------------------------------------------------

table(subj_gr)

# Problem - R interprets the empty cell as the real value,
# to get more correct results, you should say to R that empty cell means missing value.
# One possibility to do this is to add into dataset new correct variable 'subj_gr2'

detach(students)  # to add the new variable, the dataset must be deactivated at first ...

students$subj_gr2=ifelse(students$subj_gr!="O",ifelse(students$subj_gr!="S",ifelse(students$subj_gr!="A",NA,students$subj_gr),students$subj_gr),students$subj_gr)
table(students$subj_gr2)

attach(students)


table(factor(subj_gr2,labels=c("Art","Other","Science")))

subj_grf=factor(subj_gr2,labels=c("Art","Other","Science"))
round(100*prop.table(table(subj_grf)),2)

pie(table(subj_grf))

# What is the difference between following tables?

100*prop.table(table(sex,subj_grf),1)
round(100*prop.table(table(sex,subj_grf),2),1)

addmargins(table(sex,subj_grf))


# -------------------------------------------------------------------------------------

# Try by itself:
# -------------
# How are the students’ math grades distributed – construct the frequency table and the bar plot.
# Are the relative frequencies of math grades different in different specialities  
# 	(LKI=animal science, LAT=veterinary science)?
# Construct the pie plot of sex and write close to the sectors 'Men' and 'Women'.


# -------------------------------------------------------------------------------------

hist(weight,col="red",las=1)

hist(weight,col="gold",las=1,ylab="f(x)",xlab="x",main="Density of weight",xlim=c(30,110),probability=TRUE)
lines(density(na.omit(weight)),lwd=2)

# The following 4 rows should be run as one block (select these rows and press 'Ctrl'+'R')
par(mfrow=c(2,1))  # devides the diagram window into 2 rows and 1 column
hist(weight[sex=="N"],xlim=c(40,100), main="Women", xlab="kaal(kg)", ylab="Frequency")
hist(weight[sex=="M"],xlim=c(40,100), main="Men", xlab="kaal(kg)", ylab="Frequency")
par(mfrow=c(1,1))

# You can construct you own intervals:
weightinterval=cut(weight,c(40,50,60,70,80,90,100))
# insted of exact limits you can also simply say how many intervals you want: ...cut(weight,6)

table(weightinterval)
100*prop.table(table(weightinterval))

table(weightinterval,sex)

Men=table(weightinterval[sex=="M"])
Women=table(weightinterval[sex=="N"])
barplot(rbind(Women,Men),beside=TRUE,legend.text=TRUE,col=c("Green2","Green4"))


boxplot(height, ylim=c(150,195))
boxplot(height~factor(sex), col="skyblue1", names=c("Men","Women"),  ylim=c(150,195), main="Height box-plot")


# More complicated diagrams (run next 8 rows together):

par(mfrow=c(2,1))
hist(height[sex=="M"], col="lightblue", main="Men", ylab="Percentage", xlab="Height", 
las=1, probability=TRUE, ylim=c(0,0.065), xlim=c(150,205))
lines(density(na.omit(height[sex=="M"])))

hist(height[sex=="N"], col="red", main="Women", ylab="Percentage", xlab="Height", 
las=1, probability=TRUE, ylim=c(0,0.065), xlim=c(150,205))
lines(density(na.omit(height[sex=="N"])), lty=2)
par(mfrow=c(1,1))


# -------------------------------------------------------------------------------------

# Try by itself:
# -------------
# Constract the weights' histograms and densities for students eating and not eating the porridge.
# Make a frequency table of students' height devided into 6 intervals.
# Make a box-plots on head circuits for students with different math. grade.


# -------------------------------------------------------------------------------------

# Different tests (try to make a conclusions from each test)


# Is the average height different from 170?

t.test(height,mu=170,conf.level=0.99)

t.test(height[sex=="M"],mu=180,conf.level=0.95)


# Are the head circuits different depending on the math. grade being 3 or 4?

t.test(headcirc[mat==3],headcirc[mat==4]) # test assumes normal distribution in both groups

# the normality is not assumed
wilcox.test(headcirc[mat==3],headcirc[mat==4]) # Wilcoxon aka Mann-Whitney test

ks.test(headcirc[mat==3],headcirc[mat==4]) # Kolmogorov-Smirnov test
# The warning means, that in dataset there are some equal head circuit' values,
# but KS test assumes by default that all values are different.


# Is the variability of head circuits depending on the students' sex?

bartlett.test(headcirc, sex) # Bartlett test - advanced version of traditional F-test used in comparison of variances
# (assumes the normal distribution in compared groups)

# If there is no normal distribution, the Fligner-Killeen test can be used to compare variances:
fligner.test(headcirc, sex)


# Is the head circuit normally distributed?

shapiro.test(headcirc) # Shapiro-Wilk test
# As most of the tests used in controlling the normality, also the SW test is usually to stringent.
# In practical data analysis usually the graphical control of normality is sufficient. For example: 

hist(headcirc)

# As in comparing the groups, the normality is assumed in all groups, then ... (run these 4 lines together):
par(mfrow=c(2,2))
by(headcirc, sex, hist)
by(headcirc, sex, qqnorm)
par(mfrow=c(1,1))

# If there are no missing values, also the Kolmogorov-Smirnov test can be used in normality testing
# (but it is even more stringent than Shapiro-Wilk test):
ks.test(height, pnorm, mean=mean(height), sd=sd(height))


# -------------------------------------------------------------------------------------

# Try by itself:
# -------------
# Find the 95% CI of women's average weight. Is the average weight different from 60 kg (test the hypothesis)?
# Are the average body mass indexes of men and women statistically significantly different?
# Is the body mass index distribution different from the normal distribution?
 
# -------------------------------------------------------------------------------------


# And finally the analysed dataset should be deactivated:
 
detach(students)

# -------------------------------------------------------------------------------------