
I'm new to reinforcement learning. The code I'm running was created with the help of an advisor and is based on the DDPG pendulum example from keras.io. The idea is to guide a robot through a space to find the source of a chemical plume. It runs alongside a simulator written in C++ that graphically shows the smoke plume and the position of the robot. My issue is that the code is not learning: even after 4000 episodes the average reward is all over the place, as shown in the attached image.

What could I do to improve this? The simulation itself has no issues; my concern is the reinforcement learning part.

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import random
import zmq
from typing import Tuple, List
import matplotlib.pyplot as plt

problem = "CartPole-v0"

env = gym.make(problem)

num_state = 5 num_actions = 5 # 5 discrete actions to select

context = zmq.Context() socket = context.socket(zmq.REQ) port = "5555" socket.connect("tcp://localhost:%s" % port)

# mimic the env.reset() function in OpenAI gym:
# env_reset sends a message to the C++ simulator to reset the environment
# and gets back the state, reward, done information
def env_reset():
    # send_message = "heading_command reset_flag"
    send_message = "0 1"
    socket.send_string(send_message)
    recv_message = socket.recv().split()

    # convert string list to float list
    float_message = []
    for item in recv_message:
        float_message.append(float(item))

    # float_message[0:9] are states, which include:
    # Sim_time, windSpeed_x, windSpeed_y, Con, Pos_x, Pos_y, vehSpeed_x, vehSpeed_y, plume_non_detect
    Sim_time = float_message[0]
    windSpeed_x = float_message[1]
    windSpeed_y = float_message[2]
    con = float_message[3]
    posx = float_message[4]
    posy = float_message[5]
    vx = float_message[6]
    vy = float_message[7]
    plume_non_detect = float_message[8]

    observations = np.array([windSpeed_x, windSpeed_y, vx, vy, plume_non_detect])

    return observations
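For reference, this is how a reply from the simulator gets mapped into the 5-element observation. The numbers in this snippet are made up; only the field order (documented in the comments above) is taken from the code:

# hypothetical reply string, in the order:
# Sim_time windSpeed_x windSpeed_y Con Pos_x Pos_y vehSpeed_x vehSpeed_y plume_non_detect
import numpy as np

reply = b"12.5 1.2 -0.3 0.04 5.0 2.0 0.1 0.0 1"
fields = [float(item) for item in reply.split()]
obs = np.array([fields[1], fields[2], fields[6], fields[7], fields[8]])
print(obs)  # windSpeed_x, windSpeed_y, vehSpeed_x, vehSpeed_y, plume_non_detect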

def env_step(action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Returns state, reward, done flag given an action."""
    # leftover from the keras.io template, not used with the C++ simulator:
    # state, reward, done, _ = env.step(action)

    # mimicking the env.step function: python sends the action to the C++ simulator
    # and receives states, reward, and done information.
    # send_message includes the chosen action and the reset flag, "{action} reset_flag";
    # in env_step the reset flag is always 0 since we don't want to reset the environment
    send_message = f"{action} 0"
    socket.send_string(send_message)

    # recv_message includes the returned states, reward, done information
    recv_message = socket.recv().split()

    # convert string list to float list
    float_message = []
    for item in recv_message:
        float_message.append(float(item))

    # float_message[0:9] are states, which include:
    # Sim_time, windSpeed_x, windSpeed_y, Con, Pos_x, Pos_y, vehSpeed_x, vehSpeed_y, plume_non_detect
    Sim_time = float_message[0]
    windSpeed_x = float_message[1]
    windSpeed_y = float_message[2]
    con = float_message[3]
    posx = float_message[4]
    posy = float_message[5]
    vx = float_message[6]
    vy = float_message[7]
    plume_non_detect = float_message[8]

    sim_state = np.array([windSpeed_x, windSpeed_y, vx, vy, plume_non_detect])
    sim_reward = np.array(float_message[9])
    sim_done = np.array(float_message[10])

    old_vx = vx
    old_vy = vy
    return (sim_state.astype(np.float32),    # state is in the float type
            np.array(sim_reward, np.int32),  # reward is in the int type
            np.array(sim_done, np.int32))    # done is in the int type
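Since the learning depends on the simulator honouring this message contract, it can help to smoke-test env_reset/env_step against a dummy replier before involving the real plume simulation. The sketch below only mimics the protocol the two functions above assume (9 space-separated state fields, then reward, then done); the values it sends back are placeholders, not real simulator output:

# dummy_sim.py -- minimal REP-side stand-in for the C++ simulator (hypothetical values)
import zmq
import numpy as np

context = zmq.Context()
rep_socket = context.socket(zmq.REP)
rep_socket.bind("tcp://*:5555")

while True:
    msg = rep_socket.recv_string()            # "<heading_command> <reset_flag>"
    heading_command, reset_flag = msg.split()

    # 9 state fields: Sim_time, windSpeed_x, windSpeed_y, Con, Pos_x, Pos_y,
    # vehSpeed_x, vehSpeed_y, plume_non_detect -- random placeholders here
    state = np.random.uniform(-1.0, 1.0, size=9)
    reward = 0.0                               # placeholder reward
    done = 0.0 if reset_flag == "1" else float(np.random.random() < 0.01)

    reply = " ".join(str(v) for v in list(state) + [reward, done])
    rep_socket.send_string(reply)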

# the Buffer class implements experience replay
class Buffer:
    def __init__(self, buffer_capacity=10000, batch_size=64):
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size

        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_capacity, num_state))
        self.action_buffer = np.zeros((self.buffer_capacity, 1), dtype=int)
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_state))
        self.done_buffer = np.zeros((self.buffer_capacity, 1))

    # take a (s, a, r, s', done) observation tuple as the input
    def record(self, obs_tuple):
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = obs_tuple[4]

        self.buffer_counter += 1

    def update(self, state_batch, action_batch, reward_batch, next_state_batch, done_batch):
        current_Q = q_net(state_batch)   # shape [batch_size, num_actions]
        target_Q = np.copy(current_Q)    # shape [batch_size, num_actions]
        # next_Q = target_q_net(next_state_batch)   # shape [batch_size, num_actions]
        # max_next_Q = np.amax(next_Q, axis=1)      # shape [batch_size, 1]
        next_Q = q_net(next_state_batch)
        next_optimal_action = np.argmax(next_Q, axis=1)
        next_Q_target = target_q_net(next_state_batch)

        for i in range(self.batch_size):
            target_Q[i][action_batch[i][0]] = reward_batch[i] if done_batch[i] else reward_batch[i] + gamma * \
                                                                                    next_Q_target[i][next_optimal_action[i]]
        q_net.fit(x=state_batch, y=target_Q, verbose=0)

    def learn(self):
        record_range = min(self.buffer_counter, self.buffer_capacity)
        # np.random.choice(a, size) generates a random sample from range(a);
        # a: 1-D array or int, size: an integer
        batch_indices = np.random.choice(record_range, self.batch_size)

        # convert to tensors
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = self.action_buffer[batch_indices]
        reward_batch = self.reward_buffer[batch_indices]
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
        done_batch = self.done_buffer[batch_indices]

        self.update(state_batch, action_batch, reward_batch, next_state_batch, done_batch)
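As a side note, the loop in update builds a Double-DQN style target: the online q_net selects the next action and target_q_net evaluates it. A minimal vectorized sketch of the same computation, assuming the batch columns are flattened to 1-D NumPy arrays (the function name and signature are mine, not part of the original code):

import numpy as np

def double_dqn_targets(current_q, next_q_online, next_q_target,
                       actions, rewards, dones, gamma=0.99):
    """Same target as the per-sample loop in Buffer.update, computed in one shot.

    current_q, next_q_online, next_q_target: [batch, num_actions] arrays
    actions: 1-D int array; rewards, dones: 1-D float arrays
    """
    targets = current_q.copy()
    best_next = np.argmax(next_q_online, axis=1)            # online net picks a'
    rows = np.arange(len(actions))
    td = rewards + gamma * next_q_target[rows, best_next] * (1.0 - dones)
    targets[rows, actions] = td                             # only the taken action's Q changes
    return targets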


def get_critic():
    filter_size = 64
    state_input = layers.Input(shape=(num_state,))
    state_out = layers.Dense(filter_size, activation='relu')(state_input)
    state_out = layers.Dense(filter_size, activation='relu')(state_out)
    state_out = layers.Dense(filter_size, activation='relu')(state_out)
    action_out = layers.Dense(num_actions, activation='linear')(state_out)

    model = tf.keras.Model(state_input, action_out)
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=1e-6), loss='mse')
    return model

def policy(state, epsilon=0.1):
    action_q = q_net(state)  # action_q is a vector with num_actions elements
    if np.random.random() <= epsilon:
        # np.random.random() returns a random float between 0 and 1
        return random.randrange(num_actions)  # explore: random action index
    else:
        return np.argmax(action_q.numpy()[0], axis=0)  # exploit: greedy action

q_net = get_critic()
target_q_net = get_critic()

target_q_net.set_weights(q_net.get_weights())

total_episodes = 1000
gamma = 0.99

buffer = Buffer(10000, 64)

# store reward history of each episode
ep_reward_list = []
# store the average reward of the last few episodes
avg_reward_list = []

for ep in range(total_episodes):
    prev_state = env_reset()
    episodic_reward = 0
    time_count = 0
    while True:
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)

        action = policy(tf_prev_state)

        state, reward, done = env_step(action)

        buffer.record((prev_state, action, reward, state, done))
        episodic_reward += reward

        buffer.learn()

        # update the target Q-network every 20 steps
        if time_count % 20 == 0:
            target_q_net.set_weights(q_net.get_weights())

        if done:
            break

        prev_state = state
        time_count += 1

    ep_reward_list.append(episodic_reward)

    # mean reward of the last 40 episodes
    avg_reward = np.mean(ep_reward_list[-40:])
    print("Episode * {} * Episodic Reward is ==> {} and Averaged Reward is ==> {}".format(ep, episodic_reward, avg_reward))
    avg_reward_list.append(avg_reward)

model_path = 'C:/Users/Ma/Desktop/GRP'
q_net.save(model_path + 'q_net.h5', include_optimizer=False)
np.savetxt(model_path + 'ep_reward.txt', ep_reward_list)
np.savetxt(model_path + 'avg_reward.txt', avg_reward_list)

plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Episodic Reward")
plt.show()
