import numpy as np
import pandas as pd
from scipy import stats

# Load the prostate cancer dataset (tab-separated, first column is the row
# index).  The 'train' column marks each row 'T' (train) or 'F' (test);
# 'lpsa' (log prostate-specific antigen) is the response variable.
# After the two pop() calls, df holds only the eight predictor columns.
df = pd.read_csv('../data/prostate/prostate.data', delimiter='\t', index_col=0)
mask_train = df.pop('train')
df_y = df.pop('lpsa')

df.head()
lcavol lweight age lbph svi lcp gleason pgg45
1 -0.579818 2.769459 50 -1.386294 0 -1.386294 6 0
2 -0.994252 3.319626 58 -1.386294 0 -1.386294 6 0
3 -0.510826 2.691243 74 -1.386294 0 -1.386294 7 20
4 -1.203973 3.282789 58 -1.386294 0 -1.386294 6 0
5 0.751416 3.432373 62 -1.386294 0 -1.386294 6 0
# Table 3.1. Correlations of predictors in the prostate cancer data
# (computed on the training subset only, selected via mask_train == 'T').
df[mask_train == 'T'].corr()
lcavol lweight age lbph svi lcp gleason pgg45
lcavol 1.000000 0.300232 0.286324 0.063168 0.592949 0.692043 0.426414 0.483161
lweight 0.300232 1.000000 0.316723 0.437042 0.181054 0.156829 0.023558 0.074166
age 0.286324 0.316723 1.000000 0.287346 0.128902 0.172951 0.365915 0.275806
lbph 0.063168 0.437042 0.287346 1.000000 -0.139147 -0.088535 0.032992 -0.030404
svi 0.592949 0.181054 0.128902 -0.139147 1.000000 0.671240 0.306875 0.481358
lcp 0.692043 0.156829 0.172951 -0.088535 0.671240 1.000000 0.476437 0.662533
gleason 0.426414 0.023558 0.365915 0.032992 0.306875 0.476437 1.000000 0.757056
pgg45 0.483161 0.074166 0.275806 -0.030404 0.481358 0.662533 0.757056 1.000000
""" TABLE 3.2. Linear model fit to the prostate cancer data. The Z score is the
coefficient divided by its standard error (3.12). Roughly a Z score larger than two
in absolute value is significantly nonzero at the p = 0.05 level.
"""
class LinearRegression:
    """Ordinary least squares via the normal equations.

    After fit(), exposes:
      beta    -- coefficient vector, intercept first
      stderr  -- standard error of each coefficient (ESL eq. 3.12)
      z_score -- beta / stderr; |Z| > ~2 is significant at p = 0.05
    """

    def fit(self, X, y):
        """Fit on X (n x p, no intercept column) and y (n,); returns self."""
        n = X.shape[0]
        design = np.hstack([np.ones((n, 1)), X])  # prepend intercept column
        gram_inv = np.linalg.inv(design.T @ design)
        self.beta = gram_inv @ (design.T @ y)

        # Unbiased residual variance: divide by n - (p + 1) degrees of freedom.
        residuals = y - design @ self.beta
        sigma2 = (residuals @ residuals) / (n - design.shape[1])
        self.stderr = np.sqrt(np.diag(gram_inv) * sigma2)
        self.z_score = self.beta / self.stderr

        return self

    def predict(self, X):
        """Return fitted values for X (n x p, no intercept column)."""
        design = np.hstack([np.ones((X.shape[0], 1)), X])
        return design @ self.beta
    
# Standardize each predictor to zero mean / unit variance before fitting.
# NOTE(review): zscore here is computed over ALL rows (train and test
# together) — confirm this matches the book's preprocessing, since including
# test rows in the scaling leaks test statistics into training.
df = df.apply(stats.zscore)
train_x = df[mask_train == 'T']
train_y = df_y[mask_train == 'T']

model = LinearRegression().fit(train_x.values, train_y.values)

# Reproduce Table 3.2: coefficient, standard error and Z score, with the
# intercept row first followed by the eight predictors in column order.
pd.DataFrame(data = {'Coefficient': model.beta, 
                     'Std. Error': model.stderr,
                     'Z Score' : model.z_score}, 
             index = ["Intercept", *df.columns.tolist()])
Coefficient Std. Error Z Score
Intercept 2.464933 0.089315 27.598203
lcavol 0.676016 0.125975 5.366290
lweight 0.261694 0.095134 2.750789
age -0.140734 0.100819 -1.395909
lbph 0.209061 0.101691 2.055846
svi 0.303623 0.122962 2.469255
lcp -0.287002 0.153731 -1.866913
gleason -0.021195 0.144497 -0.146681
pgg45 0.265576 0.152820 1.737840
# F-test for dropping age, lcp, gleason and pgg45 from the full model.
rss1 = sum((model.predict(train_x.values) - train_y) ** 2)  # full-model RSS

# Reduced model: refit without the four candidate predictors.
train_x_hyp = train_x.drop(columns = ['age', 'lcp', 'gleason', 'pgg45'])
model_hyp = LinearRegression().fit(train_x_hyp.values, train_y.values)

# NOTE(review): a DataFrame is passed here while .values is used above;
# np.c_ accepts both, so the result is identical — consistency nit only.
rss0 = sum((model_hyp.predict(train_x_hyp) - train_y) ** 2)  # reduced-model RSS

# Degrees of freedom: numerator = number of dropped predictors (4),
# denominator = N - p - 1 for the full model (here 67 - 8 - 1 = 58).
dfn = train_x.shape[1] - train_x_hyp.shape[1]
dfd = train_x.shape[0] - train_x.shape[1] - 1
f_stats = ((rss0 - rss1) / dfn) / (rss1 / dfd)
# Upper-tail p-value of the F statistic (1 - CDF).
prob = 1 - stats.f.cdf(f_stats, dfn = dfn, dfd = dfd)

print ('RSS1 = ', rss1)
print ('RSS0 = ', rss0)
print ('F =', f_stats)
print (f'Pr(F({dfn}, {dfd}) > {f_stats:.2f}) = {prob:.2f}')
RSS1 =  29.426384459908398
RSS0 =  32.81499474881554
F = 1.6697548846375183
Pr(F(4, 58) > 1.67) = 0.17
# Evaluate the full model on the held-out rows (mask_train == 'F').
test_x = df[mask_train == 'F']
test_y = df_y[mask_train == 'F']

# Mean squared prediction error on the test set, compared against the
# trivial baseline that always predicts the training mean of the response.
test_error = np.mean((test_y - model.predict(test_x)) ** 2)
base_error = np.mean((test_y - np.mean(train_y)) ** 2)
print ('Prediction error: ', test_error)
print ('Base error rate: ', base_error)
Prediction error:  0.5212740055076002
Base error rate:  1.056733228060382