############################################################
#                                                          #
#         Virtual Laboratory of Statistics in Python       #
#                                                          #
#  Inferencial statistics with two population (10.01.2017) #
#                                                          #                
#         Complutense University of Madrid, Spain          #
#                                                          #
#   THIS SCRIPT IS PROVIDED BY THE AUTHORS "AS IS" AND     #
#   CAN BE USED BY ANYONE FOR THE PURPOSES OF EDUCATION    #
#   AND RESEARCH.                                          #
#                                                          #
############################################################
import math
import numpy as np 
import scipy.stats as s 
import statistics as ss 
import matplotlib.pyplot as plt 
import pylab

# TWO POPULATIONS

# Declare here the name of the data file
col1, col2 = np.loadtxt('datafile.dat', unpack=True)

# Summary statistics
print('           Grupo 1       Grupo 2           ')
print('================================')
print('n             =',len(col1),'      ',len(col2))
print('Minimum       = %.2f' % min(col1),'     %.2f' %min(col2))
print('Maximum       = %.2f' % max(col1),'     %.2f' % max(col2))
print('Rank          = %.2f' % (max(col1)-min(col1)),'     %.2f' % (max(col2)-min(col2)))
print('Average       = %.2f' % ss.mean(col1),'     %.2f' % ss.mean(col2))
print('Median        = %.2f' % ss.median(col1),'     %.2f' % ss.median(col2))
print('Q1            = %.2f' % np.percentile(col1,25),'     %.2f' % np.percentile(col2,25))
print('Q2            = %.2f' % np.percentile(col1,50),'     %.2f' % np.percentile(col2,50))
print('Q3            = %.2f' % np.percentile(col1,75),'     %.2f' % np.percentile(col2,75))
print('Variance      = %.2f' % ss.variance(col1),'     %.2f' % ss.variance(col2))
print('Stand. dev.   = %.2f' % ss.stdev(col1),'     %.2f' % ss.stdev(col2))
print('Stand. error of the mean = %.2f' % s.sem(col1),'     %.2f' % s.sem(col2))        

# Box-and-Whisker plot 
plt.figure()
plt.boxplot([col1,col2], 1,' ')

# Scatter plot 
y_data=[np.random.random() for x in range (0, len(col1))]
plt.figure()
plt.scatter(col1, y_data, color="red", marker="^")

y_data=[np.random.random() for x in range (0, len(col2))]
plt.figure()
plt.scatter(col2, y_data, color="red", marker="+")


# Normal probability plot 
plt.figure()
s.probplot(col1, dist="norm", plot=pylab)
pylab.show()

plt.figure()
s.probplot(col2, dist="norm", plot=pylab)
pylab.show()

# Gaussian histogram
plt.hist(col1)
plt.title("Gaussian Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

plt.hist(col2)
plt.title("Gaussian Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

# Histogram
# histtype= normed=0 1 'bar' 'step', cumulative=0 1
bins1=round(1+3.222*math.log10(len(col1)))
bins2=round(1+3.222*math.log10(len(col2)))
plt.figure()
plt.hist(col1,bins1,normed=0,color='green',alpha=0.8, histtype='bar', cumulative=0)
plt.hist(col2,bins2,normed=0,color='yellow',alpha=0.8, histtype='bar', cumulative=0)
pylab.show()


# Normality test
print()
print("Normality tests (Sample 1): ")
print()
normed_datos=(col1-ss.mean(col1))/ss.stdev(col1)
kolmogorov_results=s.kstest(normed_datos,'norm')
print("Kolmogorov = ",kolmogorov_results)
shapiro_results=s.shapiro(col1)
print("Shapiro-Wilks = ",shapiro_results)
agostino_results=s.mstats.normaltest(col1)
print("D’Agostino = ",agostino_results)
anderson_results=s.anderson(normed_datos,'norm')
print("Anderson-Darling = ",anderson_results)
print()
print()
print("Normality tests (Sample 2): ")
print()
normed_datos=(col1-ss.mean(col2))/ss.stdev(col2)
kolmogorov_results=s.kstest(normed_datos,'norm')
print("Kolmogorov = ",kolmogorov_results)
shapiro_results=s.shapiro(col2)
print("Shapiro-Wilks = ",shapiro_results)
agostino_results=s.mstats.normaltest(col2)
print("D’Agostino = ",agostino_results)
anderson_results=s.anderson(normed_datos,'norm')
print("Anderson-Darling = ",anderson_results)
print()

print()
print("Test for homogeneity of variance: ")
print()
# F-test (Fisher test)
alpha = 0.05 
F = ss.variance(col1) / ss.variance(col2)
df1 = len(col1) - 1; df2 = len(col2) - 1
p_value1 = s.f.cdf(F, df1, df2)
# p_value2=s.f(df1, df2).cdf(F)  Equivalent expression
print('F-test:',F,'  p-value = ',2*p_value1);
print()
bart_result,p_value=s.bartlett(col1,col2)
print('Bartlett test: ',bart_result,' p-value = ',p_value)
print()
levene_result,p_value=s.levene(col1,col2)
print('Levene test: ',levene_result,' p-value = ',p_value)
    
    
# Two samples t-test
# test assumes the two groups have the same variance...
t_stat, p_value = s.ttest_ind(col1, col2)
# test assumes the two groups have different variances...
t_stat_u, p_value_u = s.ttest_ind(col1, col2, equal_var=False)
print()
print('Test for means:')
print("==============================================================")
print("t-test (equal variance) : ",t_stat," ",p_value)
if p_value>=0.05:
    print("ACCEPT H0")
else:
    print("REJECT H0")
print()
print("t-test (different variance) : ",t_stat_u," ",p_value_u)
if p_value_u>=0.05:
    print("ACCEPT H0")
else:
    print("REJECT H0")
print()
# Two samples Mann Whitney U test
u_stat, p_value = s.mannwhitneyu(col1,col2)
print("Wilcoxon test (Mann Whitney U test): ",u_stat," ",p_value)
if p_value>=0.05:
    print("ACCEPT H0")
else:
    print("REJECT H0")
print()