############################################################
# #
# Virtual Laboratory of Statistics in Python #
# #
# General Linear Models: Linear Regression (01.08.2017) #
# #
# Complutense University of Madrid, Spain #
# #
# THIS SCRIPT IS PROVIDED BY THE AUTHORS "AS IS" AND #
# CAN BE USED BY ANYONE FOR THE PURPOSES OF EDUCATION #
# AND RESEARCH. #
# #
############################################################
import math

import matplotlib.pyplot as plt
import numpy as np
import pylab
import scipy.stats as s
import statistics as ss
import statsmodels.api as sm
# LINEAR REGRESSION
# Declare here the name of the data file
# Reads a two-column whitespace-separated text file; unpack=True transposes
# the result so each column comes back as its own 1-D array.
col1, col2 = np.loadtxt('datafile.dat', unpack=True)
# Box-and-Whisker plot
# Both variables side by side; second positional arg (1) enables notches,
# sym=' ' hides outlier markers.
plt.figure()
plt.boxplot([col1, col2], 1, ' ')

# Scatter plot
# Each variable is plotted against uniform random jitter on the y-axis so
# that repeated/overlapping x-values remain visible.
y_data = [np.random.random() for _ in range(len(col1))]
plt.figure()
plt.scatter(col1, y_data, color="red", marker="^")

y_data = [np.random.random() for _ in range(len(col2))]
plt.figure()
plt.scatter(col2, y_data, color="red", marker="+")
# Normal probability plot
# Q-Q plot of each variable against the theoretical normal distribution;
# points on the diagonal indicate approximate normality.
plt.figure()
s.probplot(col1, dist="norm", plot=pylab)
pylab.show()

plt.figure()
s.probplot(col2, dist="norm", plot=pylab)
pylab.show()

# Gaussian histogram
# Default-binned frequency histogram of each variable.
plt.hist(col1)
plt.title("Gaussian Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

plt.hist(col2)
plt.title("Gaussian Histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()
# Histogram
# Overlaid histograms with the bin count chosen by Sturges' rule:
#   k = 1 + 3.322 * log10(n)   (3.322 ~ 1/log10(2); the original 3.222 was a typo)
# histtype: 'bar' or 'step'; cumulative: False/True; density: False/True
# NOTE: matplotlib removed the old `normed=` kwarg (>= 3.1); `density=False`
# reproduces the original raw-frequency behaviour.
bins1 = round(1 + 3.322 * math.log10(len(col1)))
bins2 = round(1 + 3.322 * math.log10(len(col2)))
plt.figure()
plt.hist(col1, bins1, density=False, color='green', alpha=0.8,
         histtype='bar', cumulative=False)
plt.hist(col2, bins2, density=False, color='yellow', alpha=0.8,
         histtype='bar', cumulative=False)
pylab.show()
# Normality test
def _normality_tests(values, label):
    # Print four classical normality tests for one variable.
    # Kolmogorov-Smirnov and Anderson-Darling are run on the standardized
    # data (mean 0, sd 1) since they compare against the standard normal;
    # Shapiro-Wilk and D'Agostino work on the raw values.
    print()
    print("Normality tests (" + label + "): ")
    print()
    normed_datos = (values - ss.mean(values)) / ss.stdev(values)
    kolmogorov_results = s.kstest(normed_datos, 'norm')
    print("Kolmogorov = ", kolmogorov_results)
    shapiro_results = s.shapiro(values)
    print("Shapiro-Wilks = ", shapiro_results)
    agostino_results = s.mstats.normaltest(values)
    print("D’Agostino = ", agostino_results)
    anderson_results = s.anderson(normed_datos, 'norm')
    print("Anderson-Darling = ", anderson_results)
    print()
    print()


_normality_tests(col1, "Variable 1")
_normality_tests(col2, "Variable 2")
# Simple linear regression
print()

# Method 1: scipy.stats.linregress — slope, intercept, correlation
# coefficient r, two-sided p-value, and standard error of the slope.
slope_result, intercept_result, r_value, p_value, std_err_result = s.linregress(col1, col2)
print('Linear regression ')
print('===============================')
print('slope:', slope_result)
print('intercept:', intercept_result)
print('correlation coefficient:', r_value)
print('p-value:', p_value)
print('Standard error of the estimate:', std_err_result, '\n')
print()
print('===============================')
print()

# Method 2: statsmodels OLS.  Note that X=col1 and Y=col2;
# add_constant prepends the intercept column to the design matrix.
print()
results = sm.OLS(col2, sm.add_constant(col1)).fit()
print(results.summary())

# Scatter of the data plus the fitted line from a degree-1 polynomial fit.
plt.scatter(col1, col2)
m, b = np.polyfit(col1, col2, deg=1)
plt.plot(col1, col2, '.')
plt.plot(col1, m * col1 + b, 'blue')
plt.show()