0

I completed all the required preprocessing steps before training my model. When I trained a linear regression model it gave me low accuracy, so I switched to an OLS model. When I fitted the OLS model it reported an R-squared of 71.2%, but when I predicted X_test with the OLS model and calculated R-squared using sklearn.metrics.r2_score, it gave me an R-squared of 43.6%. Why is that, and what is the difference?

OLS model R-squared; normal r2_score function value

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from mlxtend.evaluate import bias_variance_decomp
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder

# Load the raw IPL auction data from the CSV file
dataset = pd.read_csv('IPL IMB381IPL2013.csv')

# Quick data audit: transposed summary statistics (one row per described
# column) and the printed schema / non-null report.
# DataFrame.info() prints its report and returns None, so `inf` is None.
stats = dataset.describe().transpose()
inf = dataset.info()  # author's observation: no missing values

# Remove the columns listed below before any modelling step
features_drop = ['Sl.NO.', 'PLAYER NAME', 'COUNTRY']
dataset = dataset.drop(columns=features_drop)

# Check multicollinearity among the candidate predictors (every column except
# the last one). Only numeric/boolean columns are correlated: the frame still
# holds text columns at this point (TEAM and PLAYING ROLE are one-hot encoded
# later), and DataFrame.corr() raises on such columns in pandas >= 2.0 instead
# of silently skipping them as older versions did.
cor = dataset.iloc[:, :-1].select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(cor, annot=True)
# plt.plot() with no arguments draws nothing and does not display the figure;
# plt.show() is what renders the heatmap when this runs as a script.
plt.show()

# Features dropped because of high correlation with other predictors.
# The "-->" marks are carried over from the original author; their meaning is
# not documented in this file.
#---------------Above 50%---------------
dropped_above_50 = [
    'ODI-RUNS-S',     # -->
    'ODI-WKTS',
    'WKTS',
    'SR-BL',          # -->
    'RUNS-S',         # -->
    'HS',             # -->
    'AVE',            # -->
    'CAPTAINCY EXP',
    'ECON',           # -->
]
#---------------Under 50%---------------
dropped_under_50 = [
    'T-RUNS',         # -->
    'SR-B',           # -->
    'AVE-BL',         # -->
]
# Remove all of them from `dataset` in place, in one call.
dataset.drop(columns=dropped_above_50 + dropped_under_50, inplace=True)
# T-WKTS / RUNS-C / ODI-SR-B
# NOTE(review): the three names above look like further candidates that were
# kept — confirm with the author.


# Standardization
# Scale every non-object column except the last one of them, which is left
# untouched (presumably the SOLD PRICE target — confirm the column order).
scaler = StandardScaler()
col_num = [name for name in dataset.columns if dataset[name].dtypes != "O"]
col_num = col_num[:-1]

# NOTE: dataset_stand is bound to the very same object as dataset (no copy),
# so the scaled values are written into `dataset` too; the code further down
# reads them from `dataset`.
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/test split performed later in this script.
dataset_stand = dataset
scaled_values = scaler.fit_transform(dataset_stand[col_num])
dataset_stand[col_num] = scaled_values

# Categorical variables: collect the object-dtype columns and print how many
# rows fall into each level of the two columns that get encoded later.
col_cat = [name for name in dataset.columns if dataset[name].dtype == "O"]
for cat_column in ('TEAM', 'PLAYING ROLE'):
    print(dataset[cat_column].value_counts())

# Separate the target column from the predictors
y = dataset['SOLD PRICE']
# Every column name other than the target, in the original column order
cols = list(dataset.columns[dataset.columns != "SOLD PRICE"])
X = dataset[cols]

# Create dummy variables for the categorical variables.
#
# The dummies are requested as floats (dtype=float). Since pandas 2.0
# get_dummies returns boolean columns by default; mixed with the float
# predictors, the later `.values` conversion then produces an object-dtype
# array that statsmodels OLS cannot fit. Float dummies keep the design matrix
# purely numeric on every pandas version.
Xy = X  # keep a handle on the un-encoded predictors

# Step 1: take the first two predictor columns and one-hot encode the
# categorical one among them (produces the TEAM_* columns).
X = pd.get_dummies(Xy.iloc[:, :2], dtype=float)
cor = X.corr()
# Drop one TEAM level so the remaining dummies are measured against it.
X = X.drop('TEAM_MI', axis=1)

# Step 2: append the playing role and encode it. The new column is named
# 'PLAYIN ROLE' (sic), so its dummies are called 'PLAYIN ROLE_*'; the spelling
# is kept so the resulting column names stay unchanged.
X['PLAYIN ROLE'] = Xy['PLAYING ROLE']
X = pd.get_dummies(X, dtype=float)
cor = X.corr()
# Drop one role level as the reference category.
X = X.drop('PLAYIN ROLE_Batsman', axis=1)

# Step 3: copy over the remaining predictors (fourth column onwards) as-is.
add_feature = Xy.iloc[:, 3:].columns
X[add_feature] = Xy[add_feature]

# Hold out 20% of the rows for testing (fixed seed for reproducibility) and
# convert each of the four pieces to its underlying NumPy array.
X_train, X_test, y_train, y_test = (
    part.values
    for part in train_test_split(X, y, test_size=.20, random_state=0)
)


# OLS model
import statsmodels.api as sm

# sm.OLS does not add an intercept by itself. When the design matrix has no
# constant column, statsmodels reports the *uncentered* R-squared
# (1 - SSR / sum(y**2)), which is typically much larger than, and not
# comparable with, sklearn's r2_score (1 - SSR / sum((y - mean(y))**2)).
# Adding the constant makes both numbers use the same definition.
# has_constant='add' forces the column even if some feature happens to be
# constant within a split, so train and test always have the same width.
X_train_const = sm.add_constant(X_train, has_constant='add')
X_test_const = sm.add_constant(X_test, has_constant='add')

ipl_model_1 = sm.OLS(y_train, X_train_const).fit()

# In-sample fit (training data). The first value is the root of the mean
# squared error, hence the RMSE label.
y_pred_ols = ipl_model_1.predict(X_train_const)
print("RMSE: ", np.sqrt(mean_squared_error(y_train, y_pred_ols)))
print("R2Score: ", r2_score(y_train, y_pred_ols))

# Out-of-sample fit (held-out data): this is the number to compare against
# other models, and it is expected to be lower than the training R-squared.
y_pred_ols_test = ipl_model_1.predict(X_test_const)
print("Test RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred_ols_test)))
print("Test R2Score: ", r2_score(y_test, y_pred_ols_test))

# Print the summary explicitly; a bare expression shows nothing in a script.
print(ipl_model_1.summary2())
desertnaut
  • 52,940
  • 19
  • 125
  • 157

0 Answers0