############################################################ # # # Virtual Laboratory of Geomathematics in Python # # # # Inferencial statistics with two populations (25.06.2017) # # # # VLG Team, Complutense University of Madrid, Spain # # # # THIS SCRIPT IS PROVIDED BY THE AUTHORS "AS IS" AND # # CAN BE USED BY ANYONE FOR THE PURPOSES OF EDUCATION # # AND RESEARCH. # # # ############################################################ import math import numpy as np import scipy.stats as s import statistics as ss import matplotlib.pyplot as plt import pylab # TWO POPULATIONS # Example James River flow comparison in two stations col1, col2 = np.loadtxt('River.dat', unpack=True) # Summary statistics print(' Group 1 Group 2 ') print('================================') print('n =',len(col1),' ',len(col2)) print('Minimum = %.2f' % min(col1),' %.2f' %min(col2)) print('Maximum = %.2f' % max(col1),' %.2f' % max(col2)) print('Rank = %.2f' % (max(col1)-min(col1)),' %.2f' % (max(col2)-min(col2))) print('Average = %.2f' % ss.mean(col1),' %.2f' % ss.mean(col2)) print('Median = %.2f' % ss.median(col1),' %.2f' % ss.median(col2)) print('Q1 = %.2f' % np.percentile(col1,25),' %.2f' % np.percentile(col2,25)) print('Q2 = %.2f' % np.percentile(col1,50),' %.2f' % np.percentile(col2,50)) print('Q3 = %.2f' % np.percentile(col1,75),' %.2f' % np.percentile(col2,75)) print('Variance = %.2f' % ss.variance(col1),' %.2f' % ss.variance(col2)) print('Stand. dev. = %.2f' % ss.stdev(col1),' %.2f' % ss.stdev(col2)) print('Stand. error of the mean = %.2f' % s.sem(col1),' %.2f' % s.sem(col2)) # Box-and-Whisker plot plt.figure() plt.boxplot([col1,col2], 1,' ') # Scatter plot y_data=[np.random.random() for x in range (0, len(col1))] plt.figure() plt.scatter(col1, y_data, color="red", marker="^") y_data=[np.random.random() for x in range (0, len(col2))] plt.figure() plt.scatter(col2, y_data, color="red", marker="+") # Normal probability plot plt.figure() s.probplot(col1, dist="norm", plot=pylab) pylab.show() plt.figure() s.probplot(col2, dist="norm", plot=pylab) pylab.show() # Gaussian histogram plt.hist(col1) plt.title("Gaussian Histogram") plt.xlabel("Value") plt.ylabel("Frequency") plt.show() plt.hist(col2) plt.title("Gaussian Histogram") plt.xlabel("Value") plt.ylabel("Frequency") plt.show() # Histogram # histtype= normed=0 1 'bar' 'step', cumulative=0 1 bins1=round(1+3.222*math.log10(len(col1))) bins2=round(1+3.222*math.log10(len(col2))) plt.figure() plt.hist(col1,bins1,normed=0,color='green',alpha=0.8, histtype='bar', cumulative=0) plt.hist(col2,bins2,normed=0,color='yellow',alpha=0.8, histtype='bar', cumulative=0) pylab.show() # Normality test print() print("Normality tests (Sample 1): ") print() normed_datos=(col1-ss.mean(col1))/ss.stdev(col1) kolmogorov_results=s.kstest(normed_datos,'norm') print("Kolmogorov = ",kolmogorov_results) shapiro_results=s.shapiro(col1) print("Shapiro-Wilks = ",shapiro_results) agostino_results=s.mstats.normaltest(col1) print("D’Agostino = ",agostino_results) anderson_results=s.anderson(normed_datos,'norm') print("Anderson-Darling = ",anderson_results) print() print() print("Normality tests (Sample 2): ") print() normed_datos=(col1-ss.mean(col2))/ss.stdev(col2) kolmogorov_results=s.kstest(normed_datos,'norm') print("Kolmogorov = ",kolmogorov_results) shapiro_results=s.shapiro(col2) print("Shapiro-Wilks = ",shapiro_results) agostino_results=s.mstats.normaltest(col2) print("D’Agostino = ",agostino_results) anderson_results=s.anderson(normed_datos,'norm') print("Anderson-Darling = ",anderson_results) print() print() print("Test for homogeneity of variance: ") print() # F-test (Fisher test) alpha = 0.05 F = ss.variance(col1) / ss.variance(col2) df1 = len(col1) - 1; df2 = len(col2) - 1 p_value1 = s.f.cdf(F, df1, df2) # p_value2=s.f(df1, df2).cdf(F) Equivalent expression print('F-test:',F,' p-value = ',2*p_value1); print() bart_result,p_value=s.bartlett(col1,col2) print('Bartlett test: ',bart_result,' p-value = ',p_value) print() levene_result,p_value=s.levene(col1,col2) print('Levene test: ',levene_result,' p-value = ',p_value) # Two samples t-test # test assumes the two groups have the same variance... t_stat, p_value = s.ttest_ind(col1, col2) # test assumes the two groups have different variances... t_stat_u, p_value_u = s.ttest_ind(col1, col2, equal_var=False) print() print('Test for means:') print("==============================================================") print("t-test (equal variance) : ",t_stat," ",p_value) if p_value>=0.05: print("ACCEPT H0") else: print("REJECT H0") print() print("t-test (different variance) : ",t_stat_u," ",p_value_u) if p_value_u>=0.05: print("ACCEPT H0") else: print("REJECT H0") print() # Two samples Mann Whitney U test u_stat, p_value = s.mannwhitneyu(col1,col2) print("Wilcoxon test (Mann Whitney U test): ",u_stat," ",p_value) if p_value>=0.05: print("ACCEPT H0") else: print("REJECT H0") print()