############################################################
# #
# Virtual Laboratory of Statistics in Python #
# #
# General Linear Models: Linear Regression (01.08.2017) #
# #
# Complutense University of Madrid, Spain #
# #
# THIS SCRIPT IS PROVIDED BY THE AUTHORS "AS IS" AND #
# CAN BE USED BY ANYONE FOR THE PURPOSES OF EDUCATION #
# AND RESEARCH. #
# #
############################################################
import math

import matplotlib.pyplot as plt
import numpy as np
import pylab
import scipy.stats as s
import statistics as ss
import statsmodels.api as sm
# LINEAR REGRESSION
# Declare here the name of the data file
# Reads a two-column whitespace-separated text file; unpack=True transposes
# the result so each column comes back as its own 1-D array.
col1, col2 = np.loadtxt('datafile.dat', unpack=True)
# Box-and-Whisker plot
# Both variables side by side; second positional arg (1) enables notches,
# sym=' ' hides outlier markers.
plt.figure()
plt.boxplot([col1, col2], 1, ' ')

# Scatter plot
# Each variable is plotted against uniform random jitter on the y-axis so
# that repeated/overlapping x-values remain visible.
y_data = [np.random.random() for _ in range(len(col1))]
plt.figure()
plt.scatter(col1, y_data, color="red", marker="^")

y_data = [np.random.random() for _ in range(len(col2))]
plt.figure()
plt.scatter(col2, y_data, color="red", marker="+")
# Normal probability plot
# Q-Q plot of each variable against the theoretical normal distribution;
# points on the diagonal indicate approximate normality.
plt.figure()
s.probplot(col1, dist="norm", plot=pylab)
pylab.show()

plt.figure()
s.probplot(col2, dist="norm", plot=pylab)
pylab.show()

# Gaussian histogram
# Default-binned frequency histogram of each variable.
plt.hist(col1)
plt.title("Gaussian Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

plt.hist(col2)
plt.title("Gaussian Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()
# Histogram
# Overlaid histograms with the bin count chosen by Sturges' rule:
#   k = 1 + 3.322 * log10(n)   (3.322 ~ 1/log10(2); the original 3.222 was a typo)
# histtype: 'bar' or 'step'; cumulative: False/True; density: False/True
# NOTE: matplotlib removed the old `normed=` kwarg (>= 3.1); `density=False`
# reproduces the original raw-frequency behaviour.
bins1 = round(1 + 3.322 * math.log10(len(col1)))
bins2 = round(1 + 3.322 * math.log10(len(col2)))
plt.figure()
plt.hist(col1, bins1, density=False, color='green', alpha=0.8,
         histtype='bar', cumulative=False)
plt.hist(col2, bins2, density=False, color='yellow', alpha=0.8,
         histtype='bar', cumulative=False)
pylab.show()
# Normality test
def _normality_tests(values, label):
    # Print four classical normality tests for one variable.
    # Kolmogorov-Smirnov and Anderson-Darling are run on the standardized
    # data (mean 0, sd 1) since they compare against the standard normal;
    # Shapiro-Wilk and D'Agostino work on the raw values.
    print()
    print("Normality tests (" + label + "): ")
    print()
    normed_datos = (values - ss.mean(values)) / ss.stdev(values)
    kolmogorov_results = s.kstest(normed_datos, 'norm')
    print("Kolmogorov = ", kolmogorov_results)
    shapiro_results = s.shapiro(values)
    print("Shapiro-Wilks = ", shapiro_results)
    agostino_results = s.mstats.normaltest(values)
    print("D’Agostino = ", agostino_results)
    anderson_results = s.anderson(normed_datos, 'norm')
    print("Anderson-Darling = ", anderson_results)
    print()
    print()


_normality_tests(col1, "Variable 1")
_normality_tests(col2, "Variable 2")
# Simple linear regression
print()

# Method 1: scipy.stats.linregress — slope, intercept, correlation
# coefficient r, two-sided p-value, and standard error of the slope.
slope_result, intercept_result, r_value, p_value, std_err_result = s.linregress(col1, col2)
print('Linear regression ')
print('===============================')
print('slope:', slope_result)
print('intercept:', intercept_result)
print('correlation coefficient:', r_value)
print('p-value:', p_value)
print('Standard error of the estimate:', std_err_result, '\n')
print()
print('===============================')
print()

# Method 2: statsmodels OLS.  Note that X=col1 and Y=col2;
# add_constant prepends the intercept column to the design matrix.
print()
results = sm.OLS(col2, sm.add_constant(col1)).fit()
print(results.summary())

# Scatter of the data plus the fitted line from a degree-1 polynomial fit.
plt.scatter(col1, col2)
m, b = np.polyfit(col1, col2, deg=1)
plt.plot(col1, col2, '.')
plt.plot(col1, m * col1 + b, 'blue')
plt.show()