% Begin your document as any latex file
\documentclass{article}
\usepackage{cs294-40}
\include{cs294-40-macros}
\def\trans{^{\mathsf{T}}}
% Begin the lecture with a line like the following:
% \begin{lecture}{lecture number}{lecture title}{scribe name}
% This replaces the usual \begin{document} that a latex
% file begins with.
\begin{lecture}{19}{Reward Shaping}{P\aa l From}{30/10/2008}
\section{Algorithm Review}
\subsection{Q-learning}
The Q-learning algorithm can be summarized as
\begin{equation}
\begin{tabular}{ll}
initialize & $s_0$\\
for & $t = 1,2,3\dots$\\
& choose an action $a_t$, let's say $\epsilon$-greedy w.r.t. $Q$\\
& execute $a_t$\\
& $Q(s_t,a_t) = (1-\alpha_t)Q(s_t,a_t) + \alpha_t\left[R(s_t,a_t,s_{t+1}) + \gamma\max_{a}Q(s_{t+1},a)\right]$
\end{tabular}
\end{equation}
We can, for example, choose $\alpha_t = \frac{1}{t+1}$. Although we have not shown it, we know this algorithm converges as long as we visit all the states with probability $P(\cdot)>0$.
\subsection{Value iteration}
Value iteration is summarized as\\\\\vspace{3mm}
\hspace{20mm}iterate\vspace{-5mm}
\begin{align}
&\forall s\hspace{10mm}\bar{V}(s) = \max_{a}\sum_{s'}P(s'|a,s)\left[R(s,a,s') + \gamma V(s')\right]\notag\\
&\forall s\hspace{10mm}V(s) = \bar{V}(s)
\end{align}
which we normally write as $V = TV$.
\section{Example}
We want to find the optimal policy to move from the start to the goal state.
\begin{equation}
\begin{tabular}{l|c|c|c|c|c|}
& S & & & & G \\\hline
Reward for entering state & 0 & 0 & 0 & 0 & 1 \\\hline
State & 1 & 2 & 3 & 4 & 5
\end{tabular}
\end{equation}
\begin{equation}
\begin{tabular}{lcl}
S &-&Start\\
G &-& Goal
\end{tabular}
\end{equation}
We have two actions
\begin{equation}
\begin{tabular}{lcl}
$\hat{L}$ &-&move to the state on the left\\
$\hat{R}$ &-&move to the state on the right
\end{tabular}
\end{equation}
\subsection{Q-learning}
We apply Q-learning to the problem:
We start by initializing $s_0 = 1$ and $a = \hat{R}$
\begin{equation}
Q(1,\hat{R}) = (1-\alpha)\underbrace{Q(1,\hat{R})}_{0} + \alpha\left[\underbrace{R(1,\hat{R},2)}_{0} + \gamma\max_{a}\underbrace{Q(2,a)}_{0}\right] = 0
\end{equation}
we now have $s_1 = 2$ and choose $a = \hat{L}$
\begin{equation}
Q(2,\hat{L}) = (1-\alpha)\underbrace{Q(2,\hat{L})}_{0} + \alpha\left[\underbrace{R(2,\hat{L},1)}_{0} + \gamma\max_{a}\underbrace{Q(1,a)}_{0}\right] = 0
\end{equation}
We see that until we coincidentally happen to hit state 5, $Q$ will always be zero. Very little information is added to the Q-learning algorithm before this happens, as there is nothing guiding us towards the right state.
We now propose the alternative reward function
\begin{equation}
\begin{tabular}{l|c|c|c|c|c|}
& S & & & & G \\\hline
Reward for entering state & 0 & $\frac{1}{4}$ & $\frac{2}{4}$ & $\frac{3}{4}$ & 1 \\\hline
State & 1 & 2 & 3 & 4 & 5
\end{tabular}
\end{equation}
We now get
\begin{equation}
Q(1,\hat{R}) = \alpha\cdot\gamma\cdot\frac{1}{4}
\end{equation}
\begin{equation}
Q(2,\hat{L}) = 0
\end{equation}
Then at time $t=2$ we are back in state 1 and have
\begin{equation}
Q(1,\hat{R}) = \alpha\cdot\gamma\cdot\frac{1}{4},
\hspace{5mm}Q(1,\hat{L}) = 0
\end{equation}
We see that we can shape the reward to guide us in the right direction.
\subsection{Value Iteration}
\begin{equation}
\begin{matrix}
\textrm{Iteration} & 0 & 1 & 2 & \dots\\
\textrm{Value Function} & \begin{bmatrix}0 \\ 0 \\ 0 \\ 0 \\ 0\end{bmatrix} & \begin{bmatrix}0 \\ 0 \\ 0 \\ 1 \\ 1\end{bmatrix} & \begin{bmatrix}0 \\ 0 \\ \gamma\cdot1 \\ \gamma\cdot1+1 \\ \gamma\cdot1+1\end{bmatrix} & \dots
\end{matrix}
\end{equation}
We do this propagating from back to front. After four iterations we
have found the optimal policy, although we have not yet converged to the value function.
\section{Reward shaping without changing the optimal policy}
So far the approach has been very heuristic. We will now look into how
we can shape the reward function without changing the relative
optimality of policies.
We start by looking at a bad example: let's say we want an agent to
reach a goal state for which it has to climb over three mountains to
get there. The original reward function has a zero reward everywhere,
and a positive reward at the goal state (which is beyond the three
mountains). We could imagine changing the reward by adding a positive
reward for making progress towards the goal by adding a positive
reward when the agent reaches the top of each mountain. Indeed, this
would favor some policies that move the agent towards the goal.
However, it would also favor the following policy: climb to the top of
the first mountain, take a few steps back, go back to the top, and
keep repeating ... In fact, depending on the discounting and the exact
reward function, the latter could even be the optimal policy.
Intuitively, the reward shaping in the example fails because the agent
gets rewarded every time they reach the top of the mountain,
independent of whether they already reached the top before. A natural
solution is to indeed reward the agent for reaching the top, but also
{\em penalize} the agent for moving away from the goal/top such that
cyclic behaviour results in a zero reward. This suggests shaping the reward function
by using a potential function $\phi$:
\begin{equation}
\bar{R}(s,a,s') = R(s,a,s') + F(s,a,s')
\end{equation}
where
\begin{equation}
F(s,a,s') = \phi(s') - \phi(s).
\end{equation}
For now, we assume:
\begin{enumerate}
\item $\exists$ ``absorbing'' state $s_F$
\item No discounting
\end{enumerate}
The first assumption means that for all policies, we will end up in state $s_t = s_F$ with probability $P = 1$, i.e.
\begin{equation}
\forall \pi\hspace{5mm} \textrm{eventually }s_t = s_F.
\end{equation}
We can remove the restriction on the discounting at a later stage.
Now consider the rewards accumulated during one episode:
\begin{equation}
\label{eq:r}
\sum_{t = 0}^{\tau-1}\left[R(s_t,a_t,s_{t+1}) + \phi(s_{t+1}) - \phi(s_{t})\right]= \left[ \sum_{t = 0}^{\tau-1}R(s_t,a_t,s_{t+1})\right] + \phi(s_{F}) - \phi(s_{0})
\end{equation}
Let $\bar{M}$ denote the MDP identical to the original MDP $M$, except
for the reward function: in $\bar{M}$ we have the shaped reward
function. Then the above can be written as:
\begin{equation}
\forall \pi: V_{\bar{M}}^{\pi}(s) = V_{M}^{\pi}(s) + \phi(s_{F}) - \phi(s)
\end{equation}
This implies that for all policies $\pi_1, \pi_2$ we have that the
ordering of their value is preserved between $M$ and $\bar{M}$:
\begin{equation}
V_{M}^{\pi_1}(s) > V_{M}^{\pi_2}(s) \Longrightarrow V_{\bar{M}}^{\pi_1}(s) > V_{\bar{M}}^{\pi_2}(s)
\end{equation}
Hence reward shaping based upon differencing a potential function has
the desired property of keeping the optimality ordering of policies
invariant.
\subsection{Infinite horizon}
We re-write (\ref{eq:r}) with the discount factor
\begin{equation}
\sum_{t = 0}^{\tau-1}\gamma^t\left[R(s_t,a_t,s_{t+1}) + \phi(s_{t+1}) - \phi(s_{t})\right]
\end{equation}
We write out
\begin{equation}
\sum_{t = 0}^{\tau-1}\gamma^t\left[\phi(s_{t+1}) - \phi(s_{t})\right] = (\phi(s_1) - \phi(s_0)) + (\gamma\phi(s_2) - \gamma\phi(s_1)) + (\gamma^2\phi(s_3) - \gamma^2\phi(s_2))\dots
\end{equation}
We see that if we multiply the first part with $\gamma$, we get
\begin{align}
\sum_{t = 0}^{\tau-1}\gamma^t\left[\gamma\phi(s_{t+1}) - \phi(s_{t})\right] &= \gamma\phi(s_1) - \phi(s_0) + \gamma^2\phi(s_2) - \gamma\phi(s_1) + \gamma^3\phi(s_3) - \gamma^2\phi(s_2)\dots\notag\\
&= - \phi(s_0) + \underbrace{(\gamma\phi(s_1) - \gamma\phi(s_1))}_{0} + \underbrace{(\gamma^2\phi(s_2) - \gamma^2\phi(s_2))}_{0} + \dots + \gamma^{\tau}\phi(s_{\tau})
\end{align}
So we choose
\begin{equation}
F(s,a,s') = \gamma\phi(s') - \phi(s).
\end{equation}
\begin{proposition}
Reward shaping with the function $F(s,a,s') = \gamma\phi(s') -
\phi(s)$ leaves the optimality ordering of policies invariant.
\begin{proof}
Let $M$ be the original MDP, and let $\bar{M}$ be identical except for
the reward function being shaped by $F(s,a,s') = \gamma\phi(s') -
\phi(s)$.
\begin{equation}
Q_M^*(s,a) = \sum_{s'}P(s'\,|\,s,a)\left[R(s,a,s') + \gamma\max_{a'}Q_M^*(s',a')\right]
\end{equation}
we add the shaping (by adding and subtracting the same terms)
\begin{equation}
\underbrace{Q_M^*(s,a) - \phi(s)}_{Q_{\bar{M}}^*(s,a)} =
\sum_{s'}P(s'\,|\,s,a)\left[\underbrace{R(s,a,s') + \gamma\phi(s') -
\phi(s)}_{\textrm{Shaped Reward Function }R_{\bar{M}}(s,a,s')} +
\gamma\max_{a'}\underbrace{(Q_M^*(s',a') -
\phi(s'))}_{Q_{\bar{M}}^*(s',a')}\right]
\end{equation}
For $\bar{M}$ we get
\begin{equation}
Q_{\bar{M}}^*(s,a) = \sum_{s'}P(s'\,|\,s,a)\left[R_{\bar{M}}(s,a,s') + \gamma\max_{a'}Q_{\bar{M}}^*(s',a')\right]
\end{equation}
We note that $Q_{\bar{M}}^*(s,a)=Q_M^*(s,a) - \phi(s)$ satisfies the Bellman equation for the reward $R_{\bar{M}}(s,a,s')=R(s,a,s') + \gamma\phi(s') - \phi(s)$.
Thus we have
%
\begin{gather}
Q_{\bar{M}}^*(s,a) = Q_{M}^*(s,a) - \phi(s)\notag\\
\arg\max_{a}Q_{\bar{M}}^*(s,a) = \arg\max_{a}Q_{M}^*(s,a)
\end{gather}
as we wanted and the optimal policy is preserved.
Now consider a special MDP, in which only 1 action is available in
each state, namely the action prescribed by a policy $\pi$. Then the
same reasoning as above goes through, and we obtain:
\begin{equation}
Q_{\bar{M}}^{\pi}(s,a) = Q_{M}^{\pi}(s,a) - \phi(s).
\end{equation}
This holds true for any policy $\pi$, hence for any pair of policies $\pi_1$, $\pi_2$ we have:
\begin{equation}
Q_{M}^{\pi_1}(s, \pi_1(s)) \geq Q_{M}^{\pi_2}(s, \pi_2(s))
\end{equation}
implies:
\begin{equation}
Q_{M}^{\pi_1}(s, \pi_1(s)) -\phi(s) \geq Q_{M}^{\pi_2}(s, \pi_2(s)) - \phi(s)
\end{equation}
hence:
\begin{equation}
Q_{\bar{M}}^{\pi_1}(s, \pi_1(s)) \geq Q_{\bar{M}}^{\pi_2}(s, \pi_2(s)).
\end{equation}
\end{proof}
\end{proposition}
\section{What is ideal shaping?}
Consider value iteration, where we iteratively compute the value function as follows:
for $k=0,1,\ldots $
$\forall s\hspace{10mm} V_{k+1}(s) = \max_{a}\sum_{s'}P(s'|a,s)\left[R(s,a,s') + \gamma V_k(s')\right]$\\
With reward shaping we have:
\[
V_{k+1}(s) = \max_{a}\sum_{s'}P(s'|a,s)\left[R(s,a,s') + \gamma \phi(s') - \phi(s) + \gamma V_k(s')\right]
\]
Assume we choose $V_0 = 0$, and $\phi = V^*$, then we obtain:
\[
V_{1}(s) = \max_{a}\sum_{s'}P(s'|a,s)\left[R(s,a,s') + \gamma V^*(s') - V^*(s) + 0 \right] = V^*(s) - V^*(s) = 0
\]
And similarly, for all $k>0$, we have $V_k(s) = 0$ for all states $s$. Note we also have:
\[
\arg \max_{a}\sum_{s'}P(s'|a,s)\left[R(s,a,s') + \gamma V^*(s') - V^*(s) + \gamma V_k(s') \right] =
\arg \max_{a}\sum_{s'}P(s'|a,s)\left[R(s,a,s') + \gamma V^*(s') \right] = \pi^*(s)
\]
Hence, from the first iteration onwards, we have converged to the
optimal value function and optimal policy.
This also suggests there is a close connection between initializing the value function and potential shaping.
\end{lecture}
\theend