In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
# Column names for the data file, which is read with header=None (no header row).
names = ['mpg', 'cylinders','displacement', 'horsepower',
         'weight', 'acceleration', 'model year', 'origin', 'car name']
# Fields are separated by runs of whitespace and '?' marks a missing value.
# sep=r'\s+' is the equivalent of delim_whitespace=True, which is deprecated
# in recent pandas releases.
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/'+
                 'auto-mpg/auto-mpg.data',
                 header=None,sep=r'\s+',names=names,na_values='?')

# Horsepower column on its own, with and without the rows where it is missing.
df1 = df[['horsepower']]
df2 = df1.dropna()

# Predictor (horsepower) and target (mpg) as aligned NumPy arrays for the
# cells below. Rows missing either value are dropped *together* so that
# x[i] and y[i] always describe the same car.
xy = df[['horsepower', 'mpg']].dropna()
x = xy['horsepower'].to_numpy()
y = xy['mpg'].to_numpy()

Plot mpg against horsepower as a basic sanity check.

In [ ]:
# Scatter of the raw data: one marker per (x, y) pair, no connecting line.
fig, ax = plt.subplots()
ax.plot(x, y, 'o')
ax.set_xlabel('horsepower')
ax.set_ylabel('mpg')
ax.grid(True)
In [ ]:
# Closed-form least-squares fit of the line y ~ beta0 + beta1*x.

# Sample means of the predictor and the target.
xm, ym = np.mean(x), np.mean(y)

# Deviations from the means, shared by the variance and covariance terms.
x_dev = x - xm
y_dev = y - ym

# Variances and covariance, each normalised by n (np.mean), not n-1.
sxx = np.mean(x_dev**2)
syy = np.mean(y_dev**2)
syx = np.mean(y_dev*x_dev)

# Slope is covariance over the variance of x; the intercept is then chosen
# so that the fitted line passes through the point (xm, ym).
beta1 = syx/sxx
beta0 = ym - beta1*xm

print("xbar={0:.2f},ybar={1:.2f}".format(xm, ym))
print("sqrt(xx)={0:.2f},sqrt(yy)={1:.2f}".format(np.sqrt(sxx), np.sqrt(syy)))
print("beta0={0:.2f},beta1={1:.2f}".format(beta0, beta1))
xbar=104.47,ybar=23.45
sqrt(xx)=38.44,sqrt(yy)=7.80
beta0=39.94,beta1=-0.16
In [ ]:
# Evaluate the fitted line at two x values; two points are enough to draw
# a straight line across the plot.
xmodel = np.array([25,250])
ymodel = beta0 + beta1*xmodel

# Data as markers, with the fitted line drawn on top as a thick solid line.
fig, ax = plt.subplots()
ax.plot(x, y, 'o')
ax.plot(xmodel, ymodel, '-', linewidth=3)
ax.set_xlabel('horsepower')
ax.set_ylabel('mpg')
ax.grid(True)
In [ ]:
# Predictions of the fitted line at the observed x values, and their residuals.
yhat = beta0 + beta1*x
resid = y - yhat

# NOTE: despite the name, this is the *mean* of the squared residuals
# (np.mean), i.e. a per-sample squared error, not their sum.
SSE = np.mean(np.square(resid))
print(SSE)
23.943662938603108