I would like to resample a df so that it has an index entry for all the working hours of the days contained in the datetimeindex of the df. I have some code, but I think this should be achived easier and that I am missing something. Also it will blow up on large dataframes because the from_product() will create also filter_id and tr_id combinations not present in the data. So basically I what new datetimeindex but only for filter (filter_id) and transactions (tr_id) already present on that day.
import pandas as pd
import numpy as np
import random
import datetime
def dummy_sample():
ind = ['filter_id','tr_id','adate', 'amin', 'amax', 'aavg', 'avar', 'acnt']
sub = pd.DataFrame(np.random.randn(32,8), columns=ind)
sub.loc[:,'adate'] = pd.date_range('2018-01-01 05:00:00', periods=2, freq='15T').repeat(16)
sub.loc[:,'filter_id'] = [1,2,3,4]*4*2
sub.loc[:,'tr_id'] = ([448]*4+[1]*4+[2]*4+[3]*4)*2
sub.set_index(['adate','filter_id','tr_id'],inplace=True)
base_trs = sub.loc[(slice(None),slice(None),[1,2,3]),'acnt'].shape[0]
sub.loc[(slice(None),slice(None),[1,2,3]),'acnt'] = [random.randint(0,100) for i in range(base_trs)]
# special tr_id 448 is the sum of all other
for i in sub.groupby(['adate','filter_id']):
(date,filter_id),df=i
df = df.droplevel(['adate','filter_id'])
sub.loc[(date,filter_id,448),'acnt'] = df.loc[[1,2,3],'acnt'].sum()
# special filter_id 1 is the sum of all filter all tr_ids
for i in sub.groupby('adate'):
date,df=i
sub.loc[(date,1,448),'acnt'] = df.loc[(date,[2,3,4],448),'acnt'].sum()
return sub
def reg_sample_soa(df):
loc_df = df.copy()
loc_df.index = df.index.remove_unused_levels()
days = np.unique(loc_df.index.levels[0].date)
#print(days)
index = pd.to_datetime([])
for day in days:
# this is for the corona SOA from 4 to 21
start = datetime.datetime.combine(day,datetime.time(4,0))
#print(start)
soa_slots = pd.date_range(start, periods=(21-4)*4,freq='15min')
#print(soa_slots)
index = index.append(pd.MultiIndex.from_product([soa_slots,loc_df.index.levels[1],loc_df.index.levels[2]]))
#print(index)
output = pd.DataFrame(index=pd.MultiIndex.from_tuples(index), columns=loc_df.columns)
#print(output.info())
output.update(loc_df)
return output
#r = awzpy.inout.read_dump('../data/test_m.dat')
r = dummy_sample()
#print(r)
t = reg_sample_soa(r)
print(t.loc[('2018-01-01 05',slice(None),slice(None)),:])
I read about resample().asfreq() solutions but they did not work for me because of the night gaps and weekends. Looking forward to your comments and hints. Thank you!
p.s.: I am using python 3.6.8 and pandas 1.0.1
edit: The best links I found are : Resampling a pandas MultiIndex dataframe
Resampling Within a Pandas MultiIndex can this be applied to my case?