I'm new to reinforcement learning. The code I'm running was written with the help of an advisor and is based on the DDPG pendulum example from keras.io. The idea is to guide a robot through a space to find the source of a chemical plume. It runs alongside a simulator written in C++ that graphically shows the smoke plume and the robot's position. My issue is that the agent is not learning: the average reward is all over the place, as in the attached image, even after 4000 episodes.
What could I do to improve this? The simulation itself has no issues; my concern is the reinforcement learning part.
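For context, the Python side talks to the simulator over a ZeroMQ request/reply socket. One round trip looks roughly like this (a minimal standalone sketch; the reply layout is the one my parsing code below assumes, and the values are only placeholders):

import zmq

ctx = zmq.Context()
sock = ctx.socket(zmq.REQ)
sock.connect("tcp://localhost:5555")
# request: "heading_command reset_flag", e.g. action 2 with no reset
sock.send_string("2 0")
# reply: space-separated floats in the assumed order
# Sim_time windSpeed_x windSpeed_y Con Pos_x Pos_y vehSpeed_x vehSpeed_y plume_non_detect reward done
reply = [float(v) for v in sock.recv().split()]

The full script is below.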
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import random
import zmq
from typing import Tuple, List
import matplotlib.pyplot as plt
problem = "CartPole-v0"
env = gym.make(problem)
num_state = 5
num_actions = 5 # 5 discrete actions to select
context = zmq.Context()
socket = context.socket(zmq.REQ)
port = "5555"
socket.connect("tcp://localhost:%s" % port)
# Mimic the env.reset() function in OpenAI Gym:
# env_reset() sends a message to the C++ simulator to reset the environment
# and gets back the state, reward, done information.
def env_reset():
    # send_message format: "heading_command reset_flag"
    send_message = "0 1"
    socket.send_string(send_message)
    recv_message = socket.recv().split()
    # convert string list to float list
    float_message = []
    for item in recv_message:
        float_message.append(float(item))
    # float_message[0:9] are states, which include:
    # Sim_time, windSpeed_x, windSpeed_y, Con, Pos_x, Pos_y, vehSpeed_x, vehSpeed_y, plume_non_detect
    Sim_time = float_message[0]
    windSpeed_x = float_message[1]
    windSpeed_y = float_message[2]
    con = float_message[3]
    posx = float_message[4]
    posy = float_message[5]
    vx = float_message[6]
    vy = float_message[7]
    plume_non_detect = float_message[8]
    observations = np.array([windSpeed_x, windSpeed_y, vx, vy, plume_non_detect])
    return observations
def env_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Returns state, reward, done flag given an action: state, reward, done = env_step(action)"""
    # Mimicking the env.step() function: Python sends the action to the C++ simulator
    # and receives the states, reward, and done information.
    # send_message includes the chosen action and the reset flag: "{action} reset_flag".
    # In env_step the reset flag is always 0, since we don't want to reset the environment.
    send_message = f"{action} 0"
    socket.send_string(send_message)
    # recv_message includes the returned states, reward, done information
    recv_message = socket.recv().split()
    # convert string list to float list
    float_message = []
    for item in recv_message:
        float_message.append(float(item))
    # float_message[0:9] are states, which include:
    # Sim_time, windSpeed_x, windSpeed_y, Con, Pos_x, Pos_y, vehSpeed_x, vehSpeed_y, plume_non_detect
    Sim_time = float_message[0]
    windSpeed_x = float_message[1]
    windSpeed_y = float_message[2]
    con = float_message[3]
    posx = float_message[4]
    posy = float_message[5]
    vx = float_message[6]
    vy = float_message[7]
    plume_non_detect = float_message[8]
    sim_state = np.array([windSpeed_x, windSpeed_y, vx, vy, plume_non_detect])
    sim_reward = np.array(float_message[9])
    sim_done = np.array(float_message[10])
    return (sim_state.astype(np.float32),    # state is float
            np.array(sim_reward, np.int32),  # reward is int
            np.array(sim_done, np.int32))    # done is int
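As a usage sketch (not part of the training script), a single scripted rollout with just these two helpers looks like this; action 0 is only an arbitrary fixed heading used to check the ZMQ round trip:

obs = env_reset()
for _ in range(200):
    obs, r, d = env_step(0)  # always command heading 0, no learning involved
    if d:
        break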
# The Buffer class implements experience replay.
class Buffer:
    def __init__(self, buffer_capacity=10000, batch_size=64):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        self.buffer_counter = 0
        self.state_buffer = np.zeros((self.buffer_capacity, num_state))
        self.action_buffer = np.zeros((self.buffer_capacity, 1), dtype=int)
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_state))
        self.done_buffer = np.zeros((self.buffer_capacity, 1))

    # Takes an (s, a, r, s', done) observation tuple as the input.
    def record(self, obs_tuple):
        index = self.buffer_counter % self.buffer_capacity
        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = obs_tuple[4]
        self.buffer_counter += 1

    def update(self, state_batch, action_batch, reward_batch, next_state_batch, done_batch):
        current_Q = q_net(state_batch)  # shape [batch_size, num_actions]
        target_Q = np.copy(current_Q)   # shape [batch_size, num_actions]
        # next_Q = target_q_net(next_state_batch)  # shape [batch_size, num_actions]
        # max_next_Q = np.amax(next_Q, axis=1)     # shape [batch_size]
        # Double-DQN style target: the online network picks the next action,
        # the target network evaluates it.
        next_Q = q_net(next_state_batch)
        next_optimal_action = np.argmax(next_Q, axis=1)
        next_Q_target = target_q_net(next_state_batch)
        for i in range(self.batch_size):
            target_Q[i][action_batch[i][0]] = reward_batch[i] if done_batch[i] else reward_batch[i] + gamma * \
                next_Q_target[i][next_optimal_action[i]]
        q_net.fit(x=state_batch, y=target_Q, verbose=0)

    def learn(self):
        record_range = min(self.buffer_counter, self.buffer_capacity)
        # np.random.choice(a, size): draws `size` random indices from range(a)
        batch_indices = np.random.choice(record_range, self.batch_size)
        # convert to tensors
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = self.action_buffer[batch_indices]
        reward_batch = self.reward_buffer[batch_indices]
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
        done_batch = self.done_buffer[batch_indices]
        self.update(state_batch, action_batch, reward_batch, next_state_batch, done_batch)
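For reference, the target that update() builds is the Double-DQN style target: target = r if done, else r + gamma * Q_target(s', argmax_a Q_online(s', a)). The per-sample loop above could also be written in vectorized form (a sketch only, reusing the same q_net, target_q_net, and gamma names; it is not part of the script I run):

def double_dqn_targets(state_batch, action_batch, reward_batch, next_state_batch, done_batch):
    # Same computation as the loop in Buffer.update(), done batch-wise.
    target_Q = q_net(state_batch).numpy()                        # [batch, num_actions]
    next_a = np.argmax(q_net(next_state_batch).numpy(), axis=1)  # online net picks a'
    next_q = target_q_net(next_state_batch).numpy()              # target net evaluates a'
    rows = np.arange(len(next_a))
    r = reward_batch.flatten()
    d = done_batch.flatten()
    target_Q[rows, action_batch.flatten()] = r + gamma * (1.0 - d) * next_q[rows, next_a]
    return target_Q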
def get_critic():
    filter_size = 64
    state_input = layers.Input(shape=(num_state,))
    state_out = layers.Dense(filter_size, activation='relu')(state_input)
    state_out = layers.Dense(filter_size, activation='relu')(state_out)
    state_out = layers.Dense(filter_size, activation='relu')(state_out)
    action_out = layers.Dense(num_actions, activation='linear')(state_out)
    model = tf.keras.Model(state_input, action_out)
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=1e-6), loss='mse')
    return model
def policy(state, epsilon=0.1):
    action_q = q_net(state)  # action_q is a vector with num_actions elements
    if np.random.random() <= epsilon:  # np.random.random() returns a random float in [0, 1)
        return random.randrange(num_actions)  # random.randrange returns a randomly selected action index
    else:
        return np.argmax(action_q.numpy()[0], axis=0)
q_net = get_critic()
target_q_net = get_critic()
target_q_net.set_weights(q_net.get_weights())
total_episodes = 1000
gamma = 0.99
buffer = Buffer(10000, 64)
# store the reward history of each episode
ep_reward_list = []
# store the average reward of the last few episodes
avg_reward_list = []
for ep in range(total_episodes):
    prev_state = env_reset()
    episodic_reward = 0
    time_count = 0
    while True:
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)
        action = policy(tf_prev_state)
        state, reward, done = env_step(action)
        buffer.record((prev_state, action, reward, state, done))
        episodic_reward += reward
        buffer.learn()
        # update the target Q-network every 20 steps
        if time_count % 20 == 0:
            target_q_net.set_weights(q_net.get_weights())
        if done:
            break
        prev_state = state
        time_count += 1
    ep_reward_list.append(episodic_reward)
    # mean reward of the last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Episodic Reward is ==> {} and Averaged Reward is ==> {}".format(ep, episodic_reward, avg_reward))
    avg_reward_list.append(avg_reward)
model_path = 'C:/Users/Ma/Desktop/GRP/'
q_net.save(model_path + 'q_net.h5', include_optimizer=False)
np.savetxt(model_path + 'ep_reward.txt', ep_reward_list)
np.savetxt(model_path + 'avg_reward.txt', avg_reward_list)
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()