############################################################
#                                                          #
#   Virtual Laboratory of Geomathematics in Python         #
#                                                          #
#   General Linear Models: Linear Regression (01.08.2017)  #
#                                                          #
#   VLG Team, Complutense University of Madrid, Spain      #
#                                                          #
#   THIS SCRIPT IS PROVIDED BY THE AUTHORS "AS IS" AND     #
#   CAN BE USED BY ANYONE FOR THE PURPOSES OF EDUCATION    #
#   AND RESEARCH.                                          #
#                                                          #
############################################################

import math

import numpy as np
import scipy.stats as s
import statistics as ss
import statsmodels.api as sm
import matplotlib.pyplot as plt
import pylab

# LINEAR REGRESSION
# Example: absorption of phosphate in a plant -- phosphate in soil (col1)
# and assimilated phosphate (col2).
col1, col2 = np.loadtxt('phosphate.dat', unpack=True)

# Box-and-whisker plot of both variables (notched boxes, outliers hidden).
# Keyword arguments replace the fragile positional form boxplot(x, 1, ' ').
plt.figure()
plt.boxplot([col1, col2], notch=True, sym=' ')

# One-dimensional scatter plots: a random y jitter spreads the points
# vertically so coincident x values remain visible.
y_data = [np.random.random() for _ in range(len(col1))]
plt.figure()
plt.scatter(col1, y_data, color="red", marker="^")

y_data = [np.random.random() for _ in range(len(col2))]
plt.figure()
plt.scatter(col2, y_data, color="red", marker="+")

# Normal probability (Q-Q) plots against the Gaussian distribution.
plt.figure()
s.probplot(col1, dist="norm", plot=pylab)
pylab.show()

plt.figure()
s.probplot(col2, dist="norm", plot=pylab)
pylab.show()

# Histograms with Matplotlib's default binning.
plt.hist(col1)
plt.title("Gaussian Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

plt.hist(col2)
plt.title("Gaussian Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

# Histograms binned with Sturges' rule: k = 1 + log2(n) = 1 + 3.322*log10(n).
# (Fixes the original coefficient 3.222, a typo for log2(10) ~ 3.3219.)
bins1 = round(1 + 3.322 * math.log10(len(col1)))
bins2 = round(1 + 3.322 * math.log10(len(col2)))
plt.figure()
# 'density=False' plots raw counts; it replaces the 'normed' keyword,
# which was removed from Matplotlib 3.1 onwards.
plt.hist(col1, bins1, density=False, color='green', alpha=0.8,
         histtype='bar', cumulative=False)
plt.hist(col2, bins2, density=False, color='yellow', alpha=0.8,
         histtype='bar', cumulative=False)
pylab.show()


def _print_normality_tests(label, data):
    """Standardize *data* and print four normality tests for it.

    Kolmogorov-Smirnov and Anderson-Darling are run on the standardized
    sample (zero mean, unit sample standard deviation) because they
    compare against the standard normal; Shapiro-Wilks and D'Agostino
    are location/scale invariant and use the raw data.
    """
    print()
    print("Normality tests (%s): " % label)
    print()
    standardized = (data - ss.mean(data)) / ss.stdev(data)
    kolmogorov_results = s.kstest(standardized, 'norm')
    print("Kolmogorov = ", kolmogorov_results)
    shapiro_results = s.shapiro(data)
    print("Shapiro-Wilks = ", shapiro_results)
    agostino_results = s.mstats.normaltest(data)
    print("D’Agostino = ", agostino_results)
    anderson_results = s.anderson(standardized, 'norm')
    print("Anderson-Darling = ", anderson_results)
    print()
    print()


_print_normality_tests("Variable 1", col1)
_print_normality_tests("Variable 2", col2)

# Simple linear regression of col2 (assimilated) on col1 (soil).
print()

# Method 1: scipy.stats.linregress
slope_result, intercept_result, r_value, p_value, std_err_result = \
    s.linregress(col1, col2)
print('Linear regression ')
print('===============================')
print('slope:', slope_result)
print('intercept:', intercept_result)
print('correlation coefficient:', r_value)
print('p-value:', p_value)
print('Standard error of the estimate:', std_err_result, '\n')
print()
print('===============================')
print()

# Method 2: statsmodels OLS.  Note that X=col1 and Y=col2; add_constant
# inserts the intercept column.
print()
results = sm.OLS(col2, sm.add_constant(col1)).fit()
print(results.summary())

# Scatter of the data with the fitted line.  The original plotted the
# points twice (scatter + plot '.'); one call is enough.
m, b = np.polyfit(col1, col2, deg=1)
plt.plot(col1, col2, '.')
# color= keyword instead of the ambiguous format string 'blue'.
plt.plot(col1, m * col1 + b, color='blue')
plt.show()