import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Column names for the UCI auto-mpg file; they are supplied here because the
# file is read with header=None.
names = ['mpg', 'cylinders', 'displacement', 'horsepower',
         'weight', 'acceleration', 'model year', 'origin', 'car name']

# Whitespace-separated file in which '?' marks a missing value.
# sep=r'\s+' is the documented equivalent of the deprecated
# delim_whitespace=True.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/' +
                 'auto-mpg/auto-mpg.data',
                 header=None, sep=r'\s+', names=names, na_values='?')

# Keep the target (mpg) and the predictor (horsepower) in the same frame so
# that dropna() removes a row from both whenever either value is missing.
df1 = df[['mpg', 'horsepower']]
df2 = df1.dropna()

# Arrays used by the plotting and fitting code below.
x = np.array(df2['horsepower'])
y = np.array(df2['mpg'])
Plot mpg against horsepower as a basic sanity check.
# Draw the raw (x, y) points as unconnected circle markers on the current
# axes, label both axes, and switch the grid on.
ax = plt.gca()
ax.plot(x, y, 'o')
ax.set_xlabel('horsepower')
ax.set_ylabel('mpg')
ax.grid(True)
# Sample means of x and y.
xm = np.mean(x)
ym = np.mean(y)

# Deviations from the means, reused for all three second moments.
x_dev = x - xm
y_dev = y - ym

# Variances and covariance, normalised by N because np.mean is used.
sxx = np.mean(x_dev**2)
syy = np.mean(y_dev**2)
syx = np.mean(y_dev*x_dev)

# Simple linear regression: slope from covariance / variance, then the
# intercept that makes the line pass through (xm, ym).
beta1 = syx/sxx
beta0 = ym - beta1*xm

# Report means, standard deviations and fitted coefficients, one per line.
print("\n".join([
    "xbar={0:.2f},ybar={1:.2f}".format(xm, ym),
    "sqrt(xx)={0:.2f},sqrt(yy)={1:.2f}".format(np.sqrt(sxx), np.sqrt(syy)),
    "beta0={0:.2f},beta1={1:.2f}".format(beta0, beta1),
]))
xbar=104.47,ybar=23.45
sqrt(xx)=38.44,sqrt(yy)=7.80
beta0=39.94,beta1=-0.16
# Two end points are enough to draw the fitted straight line.
xmodel = np.array([25, 250])
ymodel = beta1*xmodel + beta0

# Overlay the fitted line (thick solid stroke) on the scatter of the data.
ax = plt.gca()
ax.plot(x, y, 'o')
ax.plot(xmodel, ymodel, '-', linewidth=3)
ax.set_xlabel('horsepower')
ax.set_ylabel('mpg')
ax.grid(True)
# Predictions of the fitted line at the observed x values.
yhat = beta0 + beta1*x

# NOTE: despite the name SSE, np.mean makes this the *mean* of the squared
# residuals, not their sum.
resid = y - yhat
SSE = np.mean(resid**2)
print(SSE)
23.943662938603108