I am trying to create my own linear regression model in Python. I have a working model, but when I try to add a preprocessing function that scales the feature vectors I get incorrect weights.
Below is my code:
import numpy as np
import pandas as pd
df = pd.read_csv("test_data.csv")
class LinearRegression:
    def gradient_decent(self, X, y, w, b, alpha):
        """
        :param X: The training data after preprocessing(): a NumPy array with one row per example and one
            column per feature
        :param y: The labels. Each label corresponds to one example (i.e. len(y) == number of examples)
        :param w: An array of the weights of the model (i.e. len(w) == number of features)
        :param b: The bias. A constant.
        :param alpha: The learning rate - aka the step size
        :return: The updated weights and bias after one gradient step
        """
        # Define N as the number of training examples
        N = len(y)
        # Initialise delta E / delta w as a vector of zeros - one entry per feature
        delta_w = np.zeros(len(X[0]))
        # Initialise delta E / delta b as 0
        delta_b = 0

        for i in range(N):
            # Prediction for example i, without the bias
            wx = sum(w * X[i])
            delta_w += 2 * X[i] * (wx + b - y[i])
            delta_b += wx + b - y[i]

        # One gradient step on the weights and the bias
        w = w - ((2 / N) * alpha * delta_w)
        b = b - ((2 / N) * alpha * delta_b)
        return w, b
    @staticmethod
    def mean_square_error(X, y, w, b):
        res = 0
        N = len(y)
        for i in range(N):
            wx = sum(w * X[i])
            res += (wx + b - y[i]) ** 2
        return res / N
    # ================== ERROR HERE ==================
    @staticmethod
    def feature_scaling(X, y):
        """
        This function converts floating-point features from their natural range (e.g. from 100 to 900) into a
        standard range (e.g. -3 to 3) by calculating the z-score.
        :param X: The feature vectors (a pandas DataFrame)
        :param y: The labels (a pandas Series)
        :return: The scaled data
        """
        for col in X:
            mean = X[col].mean()
            std = X[col].std()
            X[col] = X[col].apply(lambda x: (x - mean) / std)

        y = y.apply(lambda x: (x - y.mean()) / y.std())
        return X, y
    # ================================================
    def preprocessing(self, X, y):
        # Transpose so that rows are examples and columns are features
        X = X.transpose()
        X, y = self.feature_scaling(X, y)
        # Convert to NumPy arrays for the training loop
        X = X.to_numpy()
        y = y.to_numpy()
        return X, y
    def train(self, X, y, alpha, epochs, w=None, b=0):
        X, y = self.preprocessing(X, y)

        if w is None:
            w = np.zeros(len(X[0]))

        for e in range(epochs):
            w, b = self.gradient_decent(X, y, w, b, alpha)
            if e % 5000 == 0:
                print('epochs:', e, 'loss:', self.mean_square_error(X, y, w, b), 'w:', w, 'b:', b)

        print('---- FINAL ---')
        return w, b, "Loss:", self.mean_square_error(X, y, w, b)
X = pd.DataFrame([df['x1'], df['x2']])
print(LinearRegression().train(X=X, y=df['y'], alpha=0.1, epochs=50000))
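In case it helps, the z-score step in feature_scaling is meant to be the plain (x - mean) / std transform; written without the per-column loop it would be something like this (just a sketch using the df loaded above, not part of the model itself):

# Vectorised z-score, the same transform feature_scaling applies column by column
Xt = df[['x1', 'x2']]                    # rows = examples, columns = features
X_scaled = (Xt - Xt.mean()) / Xt.std()   # standardise each feature column
y_scaled = (df['y'] - df['y'].mean()) / df['y'].std()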
Without feature vector scaling, my LinearRegression().train() returns:
w: [11.66585762 3.33397344], b: 23.331022792787955
which are roughly the same values I would get if I used sklearn.linear_model.
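For reference, the sklearn comparison I mean is along these lines (a sketch that assumes test_data.csv has the x1, x2 and y columns used above; I alias the import so it doesn't clash with my own LinearRegression class):

from sklearn.linear_model import LinearRegression as SkLinearRegression

sk = SkLinearRegression().fit(df[['x1', 'x2']], df['y'])
print(sk.coef_, sk.intercept_)  # roughly [11.66..., 3.33...] and 23.33...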
However, when I run the same code with feature_scaling(), my model returns:
w: [0.72353595 0.27347088], b: 8.326672684688619e-17
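To make the comparison concrete, the only sanity check I can think of is fitting sklearn on data standardised the same way and seeing whether it lands on the same numbers (again just a sketch reusing df from above):

from sklearn.linear_model import LinearRegression as SkLinearRegression

Xt = df[['x1', 'x2']]
X_std = (Xt - Xt.mean()) / Xt.std()                  # z-score the features
y_std = (df['y'] - df['y'].mean()) / df['y'].std()   # z-score the labels

sk_std = SkLinearRegression().fit(X_std, y_std)
print(sk_std.coef_, sk_std.intercept_)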
I am not sure whether the issue is with my code or with my maths, but any help at all on the matter is hugely appreciated!
Thanks in advance.
w = [11.6, 3.33] and not w = [0.72, 0.23] – jda5 May 14 '21 at 13:10