I was writing a small piece of code to illustrate the difference between scaled and unscaled data when doing PCA, and (as expected) I found that class separability was better when the analysis was done on scaled data. I also plotted the cumulative explained variance ratio to show the effect of fewer components explaining more of the variance, and I found that for the unscaled data the cumulative explained variance rose steeply within the first two components, while for the scaled data there was no point after which the remaining components added only marginally to the total explained variance ratio. These are the two plots I obtained, which seem contradictory to me.
I expected the first plot to be the other way around (the scaled data being the one with the steep ascent in cumulative explained variance), but I'm not sure whether this is just a mistake in my code or something else I can't see.
This is the code that generated both plots
from sklearn.datasets import load_wine
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.decomposition import PCA
# Load the wine dataset: 13 numeric features, 3 target classes.
wine = load_wine()
X = wine.data
y = wine.target

# Create a standardized (zero-mean, unit-variance) copy of the data.
# NOTE(review): the scaler is fitted on the FULL dataset before the
# train/test split, so test-set statistics leak into the scaling.
# For these plots it makes little difference, but for an honest
# evaluation fit the scaler on X_train only and transform X_test.
scaler = StandardScaler()
scaler.fit(X)
Xs = scaler.transform(X)

# Stratified 80/20 train/test split; the same indices are reused for
# the scaled and unscaled arrays so both PCA fits see identical rows.
# NOTE(review): no random_state is set, so the split (and therefore
# the plots) will differ between runs — set one for reproducibility.
strat_splitter = StratifiedShuffleSplit(test_size=0.2, n_splits=1)
for train_index, test_index in strat_splitter.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    Xs_train, Xs_test = Xs[train_index], Xs[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Fit PCA (all components retained) on both versions of the training data.
unscaled_data_pca = PCA().fit(X_train)
scaled_data_pca = PCA().fit(Xs_train)
# Plot the cumulative explained variance ratio for both PCA fits,
# side by side with a shared x axis (number of components).
mpl.rcParams['figure.dpi'] = 150
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True)
fig.suptitle('Cumulative explained variance')
# Raw string avoids the invalid '\%' escape-sequence warning while
# producing the exact same LaTeX label text.
ax1.set(xlabel='$n_{comp}$', ylabel=r'$\%$ of cum. expl. variance',
        title="scaled data", xticks=np.arange(1, 13, 1))
ax2.set(xlabel='$n_{comp}$', title="unscaled data")
plt.subplots_adjust(hspace=0.5, wspace=0.5)
for ax, data in [[ax1, scaled_data_pca], [ax2, unscaled_data_pca]]:
    # Prepend a zero so the curve starts at (0 components, 0 variance).
    rng = np.arange(0, data.n_components_ + 1, 1)
    csum = np.insert(data.explained_variance_ratio_, 0, 0)
    ax.plot(rng, np.cumsum(csum))
    ax.grid()
# Project the training data onto the first two principal components
# of each version and scatter-plot the result, colored by class label.
projected_data_unscaled = PCA(2).fit_transform(X_train)
projected_data_scaled = PCA(2).fit_transform(Xs_train)
fig_proj, (ax_proj_scaled, ax_proj_unscaled) = plt.subplots(1, 2)
fig_proj.suptitle("Projected data")  # fixed typo: was "Proyected"
for ax, data in [[ax_proj_scaled, projected_data_scaled],
                 [ax_proj_unscaled, projected_data_unscaled]]:
    ax.scatter(data[:, 0], data[:, 1], c=pd.Categorical(y_train))
    # Equal axis scales so the visual spread of the two clouds is comparable.
    ax.set_aspect('equal')
ax_proj_scaled.set_title("Projection of scaled data")
ax_proj_unscaled.set_title("Projection of unscaled data")
plt.show()
EDIT: Added graphs with the same scale and updated code

