I have implemented @RichardHardy's idea: estimate the variance of each time series using a HAC variance estimator, then perform a t-test. In particular, I generated surrogate data from the AR(1) process
$$x_{t+1} = \rho\, x_t + (1 - \rho)\, \epsilon_{t+1}$$
where $\epsilon$ is a normal random variable; the $(1-\rho)$ scaling keeps both the mean and the long-run standard deviation of $x$ equal to those of $\epsilon$. I also generated a dataset $y$ with exactly the same $\rho$ and the same error distribution, so the null hypothesis is true by construction. If this method works, one of the most important requirements is that it rejects a true null hypothesis no more often than the nominal significance level. I performed the variance estimation, followed by a t-test, for a range of data sizes. Here are the results:
- The naive variance estimator severely underestimates the variance for high $\rho$ values, and thus produces too many false positives.
- The HAC estimator also underestimates the variance, but less severely than the naive one. It improves progressively as the lag parameter increases, until the estimate is very close to correct (see the back-of-the-envelope calculation below).
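
For intuition about why the naive estimator fails: the variance of the sample mean of an AR(1) series is governed by the long-run variance, not the marginal one. A back-of-the-envelope calculation from the standard AR(1) formulas (my own addition):
$$\operatorname{Var}(\bar{x}_n) \approx \frac{\sigma_{\mathrm{LR}}^2}{n}, \qquad \sigma_{\mathrm{LR}}^2 = \sigma_x^2\,\frac{1+\rho}{1-\rho},$$
where $\sigma_x^2$ is the marginal variance that the naive estimator targets. For $\rho = 0.9$ the two differ by a factor of $19$, which is the order of underestimation to expect from the naive approach.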
Selection of the lag is a bit of black magic to me; if there is a rule of thumb for choosing it, I would appreciate any suggestions. Also, a quick search suggests there are newer publications claiming to improve on the Newey-West estimator implemented in StatsModels. Are these significantly better than Newey-West, and if so, where can implementations be found?
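
The only candidate rule of thumb I have come across so far is the plug-in lag often quoted for the Bartlett kernel, $\lfloor 4 (n/100)^{2/9} \rfloor$. I cannot vouch that it is the right choice here, but here is a minimal sketch of how it would slot into the code below (rule_of_thumb_lags is my own name):

import numpy as np

def rule_of_thumb_lags(n):
    # Commonly cited Bartlett-kernel plug-in for the HAC truncation lag;
    # a heuristic, not necessarily an optimal choice for this problem.
    return int(np.floor(4 * (n / 100) ** (2 / 9)))

# e.g. rule_of_thumb_lags(1000) == 6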

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from scipy.stats import ttest_ind_from_stats
def autocorr(x, rho):
    # Turn i.i.d. draws into an AR(1)-style series:
    # x[i] = rho * x[i-1] + (1 - rho) * eps[i], which preserves the mean
    # and keeps the long-run standard deviation at that of the input noise.
    rez = np.copy(x)
    for i in range(1, len(x)):
        rez[i] = rez[i-1] * rho + rez[i] * (1 - rho)
    return rez
def get_mean_std(x, hacLags=None):
    # Regress x on a constant: the intercept is the sample mean, and its
    # standard error comes from either the naive or the HAC covariance.
    df = pd.DataFrame({'x': x})
    model = smf.ols(formula='x ~ 1', data=df)
    if hacLags is None:
        results = model.fit()
    else:
        results = model.fit(cov_type='HAC', cov_kwds={'maxlags': hacLags})
    mu = results.params['Intercept']
    # Rescale the standard error of the mean into a standard deviation.
    std = results.bse['Intercept'] * np.sqrt(len(x))
    return mu, std
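
# Aside: I believe the same HAC standard error can also be obtained from a
# plain OLS fit via statsmodels' sandwich-covariance helpers, without
# refitting with cov_type='HAC'. A sketch (get_mean_std_sw is my own name):
import statsmodels.stats.sandwich_covariance as sw

def get_mean_std_sw(x, hacLags):
    results = smf.ols(formula='x ~ 1', data=pd.DataFrame({'x': x})).fit()
    covHAC = sw.cov_hac(results, nlags=hacLags)  # HAC covariance of the params
    mu = results.params['Intercept']
    std = np.sqrt(covHAC[0, 0]) * np.sqrt(len(x))
    return mu, std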
def ttest(x, y, hacLags=None):
    muX, stdX = get_mean_std(x, hacLags=hacLags)
    muY, stdY = get_mean_std(y, hacLags=hacLags)
    # Two-sample t-test from the summary statistics of each series.
    t, p = ttest_ind_from_stats(muX, stdX, len(x), muY, stdY, len(y))
    return t, p, muX, stdX, muY, stdY
def ttest_by_data_size(rho, hacLags):
    muTrue = 1
    stdTrue = 2
    rezNaive = []
    rezHAC = []
    # Log-spaced sample sizes from 10 to 100000.
    nDataArr = (10**np.linspace(1, 5, 40)).astype(int)
    for nData in nDataArr:
        x = autocorr(np.random.normal(muTrue, stdTrue, nData), rho)
        y = autocorr(np.random.normal(muTrue, stdTrue, nData), rho)
        rezNaive += [ttest(x, y)]
        rezHAC += [ttest(x, y, hacLags=hacLags)]
    # Columns of the result arrays: (t, p, muX, stdX, muY, stdY).
    rezNaive = np.array(rezNaive)
    rezHAC = np.array(rezHAC)

    fig, ax = plt.subplots(ncols=5, figsize=(20, 4), tight_layout=True)
    fig.suptitle("rho = " + str(rho))
    # First four panels: estimated means and stds of x and y vs data size.
    panels = [(0, 2, muTrue, 'X-Mean'), (1, 4, muTrue, 'Y-Mean'),
              (2, 3, stdTrue, 'X-Std'), (3, 5, stdTrue, 'Y-Std')]
    for iAx, iCol, trueVal, label in panels:
        ax[iAx].semilogx(nDataArr, rezNaive[:, iCol], label='Naive')
        ax[iAx].semilogx(nDataArr, rezHAC[:, iCol], label='HAC')
        ax[iAx].axhline(y=trueVal, linestyle='--', color='r', label='true')
        ax[iAx].set_ylabel(label)
    # Last panel: p-values of the two tests vs data size.
    ax[4].loglog(nDataArr, rezNaive[:, 1], label='Naive')
    ax[4].loglog(nDataArr, rezHAC[:, 1], label='HAC')
    ax[4].axhline(y=0.01, linestyle='--', color='r', label='significant')
    ax[4].set_ylabel('PValue')
    for a in ax:
        a.set_xlabel('Data Size')
        a.legend()
    plt.show()
ttest_by_data_size(0, 1)
ttest_by_data_size(0.1, 1)
ttest_by_data_size(0.9, 1)
ttest_by_data_size(0.9, 10)
ttest_by_data_size(0.9, 100)
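
Finally, a more systematic check of the false-positive requirement than eyeballing single p-value traces would be the empirical rejection rate under the null. A minimal sketch reusing autocorr and ttest from above (rejection_rate is my own helper):

def rejection_rate(rho, nData, hacLags=None, nTrials=1000, alpha=0.01):
    # Fraction of trials in which a true null hypothesis is rejected;
    # for a correctly sized test this should stay close to alpha.
    nReject = 0
    for _ in range(nTrials):
        x = autocorr(np.random.normal(1, 2, nData), rho)
        y = autocorr(np.random.normal(1, 2, nData), rho)
        nReject += ttest(x, y, hacLags=hacLags)[1] < alpha
    return nReject / nTrials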