############################################################ # # # Virtual Laboratory of Geomathematics in Python # # # # Inferential statistics with one population (18.07.2017) # # # # VLG Team, Complutense University of Madrid, Spain # # # # THIS SCRIPT IS PROVIDED BY THE AUTHORS "AS IS" AND # # CAN BE USED BY ANYONE FOR THE PURPOSES OF EDUCATION # # AND RESEARCH. # # # ############################################################ import math import numpy as np import scipy.stats as s import matplotlib.pyplot as plt import pylab # ONE POPULATION # Example calcium levels in water from a well located in the United States data=np.loadtxt('Ca.dat', skiprows=1) print(data) # Statistical summary n, min_max, mean, var, skew, kurt = s.describe(data) std=math.sqrt(var) print() print("==============================================================") print("n = ",n) print('Average = ',mean) print('Median = ',np.median(data)) print('Variance =',var) print('Stand. dev. = ',std) print('Stand. error of the mean =',std/math.sqrt(n)) print("==============================================================") print() # Box-and-Whisker plot plt.figure() plt.boxplot(data, 1, 'rs', 0) # Scatter plot y_data=[np.random.random() for x in range (0, len(data))] plt.figure() plt.scatter(data, y_data, color="red", marker="^") # Normal probability plot plt.figure() s.probplot(data, dist="norm", plot=pylab) pylab.show() # Gaussian histogram plt.hist(data) plt.title("Gaussian Histogram") plt.xlabel("Value") plt.ylabel("Frequency") plt.show() # Histogram # histtype= normed=0 1 'bar' 'step', cumulative=0 1 bins=round(1+3.222*math.log10(len(data))) plt.figure() plt.hist(data,bins,normed=0,color='green',alpha=0.8, histtype='bar', cumulative=0) pylab.show() # Normality test print() print("Normality tests: ") print() normed_data=(data-mean)/std kolmogorov_results=s.kstest(normed_data,'norm') print("Kolmogorov = ",kolmogorov_results) shapiro_results=s.shapiro(data) print("Shapiro-Wilks = ",shapiro_results) agostino_results=s.mstats.normaltest(data) print("D’Agostino = ",agostino_results) anderson_results=s.anderson(normed_data,'norm') print("Anderson-Darling = ",anderson_results) print() # t-test one population # Set up H0: Mean H0=85.26 print("H0 = ",H0," :") print() # test t_stat, p_value_t = s.ttest_1samp(data,H0) print("t-test : ",t_stat," p_value : ",p_value_t) if p_value_t>=0.05: print("ACCEPT H0: Mean = ",mean) else: print("REJECT H0: Mean = ",mean) print() # Wilcoxon Signed-Rank Test # H0: Median print("H0 = ",np.median(data)," :") print() # test w_stat, p_value_w = s.wilcoxon(data-np.median(data)) print("Wilcoxon test : ",w_stat," p_value : ",p_value_w) if p_value_w>=0.05: print("ACCEPT H0: Median = ",np.median(data)) else: print("REJECT H0: Median = :",np.median(data)) print() # Confidence Intervals def sdev_interval(alpha, std, data): df = n-1 chi2upper = math.sqrt(s.chi2.ppf(1-alpha/2.0, df)) chi2lower = math.sqrt(s.chi2.ppf(alpha/2.0, df)) LRL = std * math.sqrt(n)/ chi2upper URL = std * math.sqrt(n)/ chi2lower return (LRL, URL) def var_interval(alpha, var, data): LRL, URL = sdev_interval(alpha, math.sqrt(var), data) return (LRL*LRL, URL*URL) # Choose level of significance alpha=0.05 ci_mean = s.norm.interval(1-alpha,loc=mean,scale=std/math.sqrt(n)) print("Confidence intervals. Level of significance = ",alpha," :") print() print(" Mean : ",ci_mean) ci_sdev = sdev_interval(alpha, std, data) print(" Stand. dev. : ",ci_sdev) ci_var = var_interval(alpha,var,data) print(" Variance : ",ci_var) print()